
Commit 589b703

Use PF to run evaluation (#33707)
* pf template
* Use PF to run evaluation
1 parent 43452dd commit 589b703

30 files changed (+908 additions, -261 deletions)

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_base_handler.py

Lines changed: 13 additions & 18 deletions
@@ -9,19 +9,18 @@ class BaseHandler(metaclass=abc.ABCMeta):
 
     def __init__(self, asset, test_data, prediction_data=None, ground_truth=None, **kwargs):
         self._prediction_data = None
+        self._input_output_data = None
         self.asset = asset
 
         test_data_df = pd.DataFrame(test_data)
+        if self.asset is None:
+            self._input_output_data = test_data_df
+            self._prediction_data = test_data_df
 
         if isinstance(prediction_data, str) and prediction_data in test_data_df.columns:
             self._prediction_data = test_data_df[[prediction_data]]
             test_data_df = test_data_df.drop(prediction_data, axis=1)
 
-        self._ground_truth = None
-        if isinstance(ground_truth, str) and ground_truth in test_data_df.columns:
-            self._ground_truth = test_data_df[[ground_truth]]
-            test_data_df = test_data_df.drop(ground_truth, axis=1)
-
         self._test_data = test_data_df
 
         self.params_dict = kwargs.pop("params_dict", None)
@@ -33,23 +32,19 @@ def test_data(self):
     @property
     def prediction_data(self):
         if self._prediction_data is None:
-            prediction_data = self.generate_prediction_data()
-            prediction_data_df = pd.DataFrame(prediction_data)
-            self._prediction_data = prediction_data_df
+            self.execute_target()
         return self._prediction_data
 
     @property
-    def ground_truth(self):
-        return self._ground_truth
+    def input_output_data(self):
+        if self._input_output_data is None:
+            self.execute_target()
+        return self._input_output_data
+
 
     @abc.abstractmethod
-    def generate_prediction_data(self):
+    def execute_target(self):
         """
-        Abstract method to generated prediction data.
+        Abstract method to generated prediction data and input output data.
         Should be implemented by all subclasses.
-        """
-
-    def get_test_data_as_jsonl(self):
-        if self.params_dict:
-            return self.test_data.assign(**self.params_dict).to_dict("records")
-        return self.test_data.to_dict("records")
+        """

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_constants.py

Lines changed: 5 additions & 8 deletions
@@ -2,6 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+from typing import Dict, Union
+
 QA = "qa"
 CHAT = "chat"
 
@@ -12,11 +14,6 @@
     CHAT: "rag-evaluation",
 }
 
-TYPE_TO_KWARGS_MAPPING = {
-    "qa": ["questions", "contexts", "y_pred", "y_test"],
-    "rag-evaluation": ["y_pred"]
-}
-
 
 class EvaluationMetrics:
     """
@@ -71,9 +68,9 @@ class ChatMetrics:
 ]
 
 
-TASK_TYPE_TO_METRICS_MAPPING = {
-    "qa": QaMetrics,
-    "rag-evaluation": ChatMetrics
+TASK_TYPE_TO_METRICS_MAPPING: Dict[str, Union[QaMetrics, ChatMetrics]] = {
+    "qa": QaMetrics(),
+    "rag-evaluation": ChatMetrics()
 }
 
 SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING = {
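
As a quick illustration of what the typed mapping buys (the stub classes below stand in for the real QaMetrics/ChatMetrics), a lookup now returns a ready-made instance rather than a class, and the annotation tells type checkers the value type:

from typing import Dict, Union


class QaMetrics:  # stand-in stub; the real class lives in _constants.py
    pass


class ChatMetrics:  # stand-in stub
    pass


TASK_TYPE_TO_METRICS_MAPPING: Dict[str, Union[QaMetrics, ChatMetrics]] = {
    "qa": QaMetrics(),
    "rag-evaluation": ChatMetrics(),
}

metrics = TASK_TYPE_TO_METRICS_MAPPING["qa"]
print(isinstance(metrics, QaMetrics))  # True -- an instance, not the class object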

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_data_fetcher.py

Lines changed: 0 additions & 34 deletions
This file was deleted.

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_evaluate.py

Lines changed: 25 additions & 62 deletions
@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import copy
+from hmac import new
 import json
 import os
 import shutil
@@ -27,7 +28,7 @@
 from azure.ai.generative.evaluate._utils import _is_flow, load_jsonl, _get_artifact_dir_path, _copy_artifact
 from azure.ai.generative.evaluate._mlflow_log_collector import RedirectUserOutputStreams
 from azure.ai.generative.evaluate._constants import SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING, SUPPORTED_TASK_TYPE, CHAT, \
-    TYPE_TO_KWARGS_MAPPING, SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING
+    SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING
 from azure.ai.generative.evaluate._evaluation_result import EvaluationResult
 from ._metrics_handler._prompt_metric_handler import PromptMetricHandler
 
@@ -67,27 +68,6 @@ def _get_metric_handler_class(
     return handler
 
 
-def _validate_data(data, prediction_data, truth_data):
-    errors = []
-    prediction_data_column = ""
-    truth_data_column = ""
-
-    if isinstance(prediction_data, str):
-        prediction_data_column = data[0].get(prediction_data, None)
-
-    if isinstance(truth_data, str):
-        truth_data_column = data[0].get(truth_data, None)
-
-    if prediction_data_column is None:
-        errors.append("prediction_data column not found in data")
-
-    if truth_data_column is None:
-        errors.append("truth_data column not found in data")
-
-    if len(errors) > 1:
-        raise Exception(f'Invalid data {" ,".join(errors)}')
-
-
 def _log_metrics(run_id, metrics):
     """
     Helper method to log metrics into specified run.
@@ -135,7 +115,7 @@ def evaluate(
     task_type: Optional[str] = None,
     metrics_list: Optional[List[str]] = None,
     model_config: Optional[Dict[str, str]] = None,
-    data_mapping: Optional[Mapping] = None,
+    data_mapping: Optional[Dict[str, str]] = None,
     output_path: Optional[str] = None,
     **kwargs
 ):
@@ -154,9 +134,9 @@
     :keyword metrics_list: List of metrics to calculate. A default list is picked based on task_type if not set.
     :paramtype metrics_list: Optional[List[str]]
     :keyword model_config: GPT configuration details needed for AI-assisted metrics.
-    :paramtype model_config: Dict[str, str]
+    :paramtype model_config: Optional[Dict[str, str]]
     :keyword data_mapping: GPT configuration details needed for AI-assisted metrics.
-    :paramtype data_mapping: typing.Mapping
+    :paramtype data_mapping: Optional[Dict[str, str]]
     :keyword output_path: The local folder path to save evaluation artifacts to if set
     :paramtype output_path: Optional[str]
     :keyword tracking_uri: Tracking uri to log evaluation results to AI Studio
@@ -182,8 +162,20 @@
     if model_config:
         metrics_config.update({"openai_params": model_config})
 
+
     if data_mapping:
-        metrics_config.update(data_mapping)
+        import warnings
+
+        new_data_mapping = dict(data_mapping)
+        if "y_pred" in new_data_mapping:
+            warnings.warn("y_pred is deprecated, please use \"answer\" instead")
+            value = data_mapping.pop("y_pred")
+            new_data_mapping.update({"answer": value})
+        if "y_test" in new_data_mapping:
+            warnings.warn("y_test is deprecated, please use \"ground_truth\" instead")
+            value = data_mapping.pop("y_test")
+            new_data_mapping.update({"ground_truth": value})
+        data_mapping = new_data_mapping
 
     sweep_args = kwargs.pop("sweep_args", None)
     if sweep_args:
@@ -230,8 +222,6 @@ def _evaluate(
     evaluation_name=None,
     target=None,
     data=None,
-    truth_data=None,
-    prediction_data=None,
     task_type=None,
     metrics=None,
     data_mapping=None,
@@ -248,14 +238,8 @@
     test_data = data
     _data_is_file = False
 
-    if "y_pred" in data_mapping:
-        prediction_data = data_mapping.get("y_pred")
-
-    if "y_test" in data_mapping:
-        truth_data = data_mapping.get("y_test")
-
-    if target is None and prediction_data is None:
-        raise Exception("target and prediction data cannot be null")
+    if "answer" in data_mapping:
+        prediction_data = data_mapping.get("answer")
 
     if task_type not in SUPPORTED_TASK_TYPE:
         raise Exception(f"task type {task_type} is not supported")
@@ -281,8 +265,6 @@
 
     asset_handler = asset_handler_class(
         asset=target,
-        prediction_data=prediction_data,
-        ground_truth=truth_data,
         test_data=test_data,
         metrics_config=metrics_config,
         **kwargs
@@ -299,8 +281,6 @@
     custom_prompt_metrics = [metric for metric in metrics if isinstance(metric, PromptMetric)]
     code_metrics = [metric for metric in metrics if isinstance(metric, CodeMetric)]
 
-    # TODO : Once PF is used for inbuilt metrics parallelize submission of metrics calculation of different kind
-
     if custom_prompt_metrics:
        for metric in custom_prompt_metrics:
            metrics_config.setdefault(metric.name, {param: param for param in metric.parameters})
@@ -309,12 +289,8 @@
            task_type="custom-prompt-metric",
            metrics=custom_prompt_metrics,
            prediction_data=asset_handler.prediction_data,
-            truth_data=asset_handler.ground_truth,
            test_data=asset_handler.test_data,
            metrics_mapping=metrics_config,
-            prediction_data_column_name=prediction_data if isinstance(prediction_data, str) else None,
-            ground_truth_column_name=truth_data if isinstance(truth_data, str) else None,
-            type_to_kwargs="custom-prompt-metric"
        )
 
        prompt_metric_results = prompt_metric_handler.calculate_metrics()
@@ -328,12 +304,8 @@
            task_type="custom-code-metric",
            metrics=code_metrics,
            prediction_data=asset_handler.prediction_data,
-            truth_data=asset_handler.ground_truth,
            test_data=asset_handler.test_data,
            metrics_mapping=metrics_config,
-            prediction_data_column_name=prediction_data if isinstance(prediction_data, str) else None,
-            ground_truth_column_name=truth_data if isinstance(truth_data, str) else None,
-            type_to_kwargs="code-prompt-metric"
        )
 
        code_metric_results = code_metric_handler.calculate_metrics()
@@ -347,12 +319,10 @@
            task_type=SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING[task_type],
            metrics=inbuilt_metrics,
            prediction_data=asset_handler.prediction_data,
-            truth_data=asset_handler.ground_truth,
+            input_output_data=asset_handler.input_output_data,
            test_data=asset_handler.test_data,
            metrics_mapping=metrics_config,
-            prediction_data_column_name=prediction_data if isinstance(prediction_data, str) else None,
-            ground_truth_column_name=truth_data if isinstance(truth_data, str) else None,
-            type_to_kwargs=TYPE_TO_KWARGS_MAPPING[task_type]
+            data_mapping=data_mapping,
        )
 
        inbuilt_metrics_results = inbuilt_metrics_handler.calculate_metrics()
@@ -393,6 +363,7 @@
        eval_artifact_df = _get_instance_table(metrics_results, task_type, asset_handler).to_json(orient="records",
                                                                                                  lines=True,
                                                                                                  force_ascii=False)
+        # eval_artifact_df = result.to_json(orient="records", lines=True, force_ascii=False)
        tmp_path = os.path.join(tmpdir, "eval_results.jsonl")
 
        with open(tmp_path, "w", encoding="utf-8") as f:
@@ -480,22 +451,14 @@ def _get_chat_instance_table(metrics):
 
 
 def _get_instance_table(metrics, task_type, asset_handler):
-    if metrics.get("artifacts"):
-        metrics.get("artifacts").pop("bertscore", None)
+
     if task_type == CHAT:
        instance_level_metrics_table = _get_chat_instance_table(metrics.get("artifacts"))
    else:
        instance_level_metrics_table = pd.DataFrame(metrics.get("artifacts"))
 
-    prediction_data = asset_handler.prediction_data
-    for column in asset_handler.prediction_data.columns.values:
-        if column in asset_handler.test_data.columns.values:
-            prediction_data.drop(column, axis=1, inplace=True)
-
    combined_table = pd.concat(
-        [asset_handler.test_data,
-         prediction_data,
-         asset_handler.ground_truth,
+        [asset_handler.input_output_data,
         instance_level_metrics_table
        ],
        axis=1,
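
The deprecation shim added to evaluate() can be read in isolation as the sketch below (the helper name _migrate_data_mapping is illustrative, not SDK API): callers should pass "answer" and "ground_truth" keys in data_mapping, while the old y_pred / y_test keys are rewritten with a warning.

import warnings
from typing import Dict


def _migrate_data_mapping(data_mapping: Dict[str, str]) -> Dict[str, str]:
    # Mirrors the shim in evaluate(): rewrite deprecated keys to the new names.
    new_mapping = dict(data_mapping)
    if "y_pred" in new_mapping:
        warnings.warn('y_pred is deprecated, please use "answer" instead')
        new_mapping["answer"] = new_mapping.pop("y_pred")
    if "y_test" in new_mapping:
        warnings.warn('y_test is deprecated, please use "ground_truth" instead')
        new_mapping["ground_truth"] = new_mapping.pop("y_test")
    return new_mapping


print(_migrate_data_mapping({"y_pred": "response", "y_test": "reference"}))
# {'answer': 'response', 'ground_truth': 'reference'}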

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_local_code_handler.py

Lines changed: 17 additions & 9 deletions
@@ -3,8 +3,10 @@
 # ---------------------------------------------------------
 
 import logging
+import pandas as pd
 
 from azure.ai.generative.evaluate._base_handler import BaseHandler
+from ._utils import df_to_dict_list
 
 logger = logging.getLogger(__name__)
 
@@ -22,21 +24,27 @@ def __init__(self, asset, test_data, prediction_data=None, ground_truth=None, **
             **kwargs
         )
 
-    def generate_prediction_data(self):
-        # TODO: Check if this is the right place for this logic
+    def execute_target(self):
         prediction_data = []
-        test_data = self.get_test_data_as_jsonl()
+        input_output_data = []
+        test_data = df_to_dict_list(self.test_data, self.params_dict)
 
         import inspect
         is_asset_async = False
         if inspect.iscoroutinefunction(self.asset):
             is_asset_async = True
             import asyncio
 
-        for d in test_data:
-            prediction_data.append(
-                asyncio.run(self.asset(**d)) if is_asset_async else self.asset(**d)
-            )
+        for input in test_data:
+            # The assumption here is target function returns a dict with output keys
+            fn_output = asyncio.run(self.asset(**input)) if is_asset_async else self.asset(**input)
+
+            prediction_data.append(fn_output)
+            # When input and output have a common key, value in output overrides value in input
+            input_output = dict(input)
+            input_output.update(fn_output)
+            input_output_data.append(input_output)
 
-
-        return prediction_data
+
+        self._prediction_data = pd.DataFrame(prediction_data)
+        self._input_output_data = pd.DataFrame(input_output_data)
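
A small usage sketch of the assumption the new loop relies on: a local target callable returns a dict of output keys, and each input row is merged with that dict to form one input/output record, with output values winning on key collisions. The my_target function and its fields are invented for illustration.

def my_target(question: str, context: str = "") -> dict:
    # Hypothetical local target: returns a dict of output keys.
    return {"answer": f"echo: {question}", "context": "generated context"}


row = {"question": "What is PF?", "context": "docs"}
output = my_target(**row)

merged = dict(row)
merged.update(output)  # the output "context" overrides the input one
print(merged)
# {'question': 'What is PF?', 'context': 'generated context', 'answer': 'echo: What is PF?'}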
