
Commit 5ee1d7a

Spec Review feedback (#34032)
* Spec Review feedback incorporated
* Updating signature for code metric handler
* Rebased from main
* Adding tests and user agent
1 parent bde74f3 commit 5ee1d7a

14 files changed: +251, -157 lines changed

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_client/openai_client.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 from openai import AsyncAzureOpenAI
 from openai.types.chat.chat_completion import ChatCompletion

-from azure.ai.generative._user_agent import USER_AGENT
+from azure.ai.generative.evaluate._user_agent import USER_AGENT
 from azure.ai.generative.constants._common import USER_AGENT_HEADER_KEY

 semaphore = asyncio.Semaphore(10)

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_evaluate.py

Lines changed: 25 additions & 14 deletions
@@ -9,9 +9,11 @@
 import tempfile
 import time
 import logging
+from collections import Counter
 from json import JSONDecodeError
 from pathlib import Path
 from typing import Callable, Optional, Dict, List, Mapping
+from types import FunctionType

 import mlflow
 import numpy as np

@@ -25,7 +27,8 @@

 from azure.ai.generative.evaluate._metric_handler import MetricHandler
 from azure.ai.generative.evaluate._metrics_handler._code_metric_handler import CodeMetricHandler
-from azure.ai.generative.evaluate._utils import _is_flow, load_jsonl, _get_artifact_dir_path, _copy_artifact
+from azure.ai.generative.evaluate._utils import _is_flow, load_jsonl, _get_artifact_dir_path, _copy_artifact, \
+    is_lambda_function
 from azure.ai.generative.evaluate._mlflow_log_collector import RedirectUserOutputStreams
 from azure.ai.generative.evaluate._constants import SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING, SUPPORTED_TASK_TYPE, CHAT, \
     SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING

@@ -84,25 +87,33 @@ def _log_metrics(run_id, metrics):


 def _validate_metrics(metrics, task_type):
-    genai_metrics = []
+    prompt_metrics = []
     builtin_metrics =[]
+    code_metrics = []
     unknown_metrics = []

     for metric in metrics:
-        if isinstance(metric, GenAIMetric):
-            genai_metrics.append(metric.name)
+        if isinstance(metric, PromptMetric):
+            prompt_metrics.append(metric)
         elif isinstance(metric, str) and metric in SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING[task_type].SUPPORTED_LIST:
             builtin_metrics.append(metric)
+        elif isinstance(metric, FunctionType):
+            if is_lambda_function(metric):
+                raise Exception("Lambda methods are not supported as code metrics")
+            code_metrics.append(metric)
+
         else:
             unknown_metrics.append(metric)

     if len(unknown_metrics) > 0:
         raise Exception("Unsupported metric found in the list")

-    # if len(set(genai_metrics) & set(builtin_metrics)) > 0:
-    if len(genai_metrics) != len(set(genai_metrics)) or len(builtin_metrics) != len(set(builtin_metrics))\
-            or (len(set(genai_metrics) & set(builtin_metrics)) > 0):
-        raise Exception("Duplicate metric name found. Metric names should be unique")
+    counter = Counter(builtin_metrics + [metric.name for metric in prompt_metrics] + [metric.__name__ for metric in code_metrics])
+    duplicates = [key for key, value in counter.items() if value > 1]
+    if len(duplicates) > 0:
+        raise Exception(f"Duplicate metric name found {duplicates}. Metric names should be unique")
+
+    return builtin_metrics, prompt_metrics, code_metrics


 @distributed_trace

@@ -275,21 +286,20 @@ def _evaluate(
     if metrics is None:
         metrics = SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING[task_type].DEFAULT_LIST

-    _validate_metrics(metrics, task_type)
+    inbuilt_metrics, custom_prompt_metrics, code_metrics = _validate_metrics(metrics, task_type)

-    inbuilt_metrics = [metric for metric in metrics if not isinstance(metric, GenAIMetric)]
-    custom_prompt_metrics = [metric for metric in metrics if isinstance(metric, PromptMetric)]
-    code_metrics = [metric for metric in metrics if isinstance(metric, CodeMetric)]
+    # TODO : Once PF is used for inbuilt metrics parallelize submission of metrics calculation of different kind

     if custom_prompt_metrics:
         for metric in custom_prompt_metrics:
-            metrics_config.setdefault(metric.name, {param: param for param in metric.parameters})
+            metrics_config.setdefault(metric.name, {param: param for param in metric._template_variable})

         prompt_metric_handler = PromptMetricHandler(
             task_type="custom-prompt-metric",
             metrics=custom_prompt_metrics,
             prediction_data=asset_handler.prediction_data,
             test_data=asset_handler.test_data,
+            input_output_data=asset_handler.input_output_data,
             metrics_mapping=metrics_config,
         )

@@ -302,8 +312,9 @@ def _evaluate(
     if code_metrics:
         code_metric_handler = CodeMetricHandler(
             task_type="custom-code-metric",
-            metrics=code_metrics,
+            metrics=[CodeMetric(name=metric.__name__, calculate=metric) for metric in code_metrics],
             prediction_data=asset_handler.prediction_data,
+            input_output_data=asset_handler.input_output_data,
             test_data=asset_handler.test_data,
             metrics_mapping=metrics_config,
         )

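The reworked _validate_metrics above now sorts a mixed metrics list into built-in metric names, PromptMetric instances, and plain code-metric functions, rejects lambdas, and uses collections.Counter to flag duplicate names. Below is a minimal, standalone sketch of that triage logic under stated assumptions: it avoids the SDK imports, the prompt-metric object is only assumed to expose a .name attribute, and the supported built-in names and the gpt_groundedness function are hypothetical.

# Minimal sketch of the metric triage and duplicate check; not the SDK implementation.
from collections import Counter
from types import FunctionType


def split_metrics(metrics, supported_builtin_names):
    builtin, prompt, code = [], [], []
    for metric in metrics:
        if isinstance(metric, str) and metric in supported_builtin_names:
            builtin.append(metric)
        elif isinstance(metric, FunctionType):
            if metric.__name__ == "<lambda>":  # mirrors is_lambda_function
                raise Exception("Lambda methods are not supported as code metrics")
            code.append(metric)
        else:
            prompt.append(metric)  # assumed PromptMetric-like, exposing .name

    counter = Counter(builtin + [m.name for m in prompt] + [fn.__name__ for fn in code])
    duplicates = [name for name, count in counter.items() if count > 1]
    if duplicates:
        raise Exception(f"Duplicate metric name found {duplicates}. Metric names should be unique")
    return builtin, prompt, code


def gpt_groundedness(*, data, **kwargs):  # hypothetical user-defined code metric
    return {"length": len(str(data))}


try:
    # The code metric reuses a built-in name, so the duplicate check fires.
    split_metrics(["gpt_groundedness", gpt_groundedness], {"gpt_groundedness"})
except Exception as err:
    print(err)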
sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_local_flow_handler.py

Lines changed: 4 additions & 1 deletion
@@ -5,6 +5,7 @@
 import pandas as pd

 from ._base_handler import BaseHandler
+from ._user_agent import USER_AGENT
 from ._utils import df_to_dict_list, run_pf_flow_with_dict_list, wait_for_pf_run_to_complete


@@ -35,7 +36,9 @@ def execute_target(self):
         wait_for_pf_run_to_complete(pf_run_result.name)

         logger.debug("PF run results: %s", pf_run_result.properties)
-        pf_client = PFClient()
+        pf_client = PFClient(
+            user_agent=USER_AGENT
+        )
         result_df = pf_client.get_details(pf_run_result.name, all_results=True)

         # Rename inputs columns. E.g. inputs.question -> question

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_metric_handler.py

Lines changed: 9 additions & 1 deletion
@@ -2,13 +2,19 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import pandas as pd
+import logging

 from os import path
 from typing import Dict, Optional

 from azure.ai.generative.evaluate._constants import TASK_TYPE_TO_METRICS_MAPPING
+from ._user_agent import USER_AGENT
+
 from ._utils import run_pf_flow_with_dict_list, df_to_dict_list, wait_for_pf_run_to_complete

+LOGGER = logging.getLogger(__name__)
+
+
 class MetricHandler(object):

     def __init__(

@@ -49,7 +55,9 @@ def calculate_metrics(self) -> Dict:
         from promptflow import PFClient
         from promptflow.entities import AzureOpenAIConnection, OpenAIConnection

-        pf_client = PFClient()
+        pf_client = PFClient(
+            user_agent=USER_AGENT
+        )

         openai_config = self.metrics_mapping["openai_params"]
         conn_name = "openai_connection"

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_metrics_handler/_code_metric_handler.py

Lines changed: 3 additions & 15 deletions
@@ -20,6 +20,7 @@ def __init__(
         task_type,
         prediction_data,
         test_data,
+        input_output_data,
         metrics_mapping=None,
         metrics=None,
     ):

@@ -30,6 +31,7 @@ def __init__(
             test_data=test_data,
             metrics_mapping=metrics_mapping,
             metrics=metrics,
+            input_output_data=input_output_data,
         )

         self._validate()

@@ -84,7 +86,7 @@ def _calculate_metric(self, metric, data, response):
         with ThreadPoolExecutor(thread_name_prefix="code_metrics_row") as thread_pool:
             for i in range(0, len(data)):
                 row_metric_futures.append(thread_pool.submit(
-                    self._submit_method, metric.calculate, data=data[i], response=response[i]
+                    self._submit_method, metric.calculate, data={**data[i], **response[i]}
                 ))

             for row_metric_future in row_metric_futures:

@@ -107,18 +109,4 @@ def _calculate_metric(self, metric, data, response):
             {metric.name: row_metric_results}
         )

-        if metric.aggregator:
-            try:
-                aggregated_values = self._submit_method(
-                    metric.aggregator,
-                    values=results.get("artifacts").get(metric.name)
-                )
-                results["metrics"].update(
-                    {
-                        f"{key}_{metric.name}": value for key, value in aggregated_values.items()
-                    }
-                )
-            except Exception as ex:
-                LOGGER.info(
-                    f"Error aggregating values for metric {metric.name} , failed with error {str(ex)} : Stack trace : {str(ex.__traceback__)}")
         return results

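The handler change above switches the per-row call from separate data/response arguments to a single merged dict, so a code-metric function sees test-data columns and prediction columns together. A small illustration under stated assumptions: the column names are hypothetical, and _submit_method is assumed to forward keyword arguments to the callable unchanged.

# Illustration only: a code metric receiving one merged row dict.
def answer_length(*, data, **kwargs):
    # "answer" is a hypothetical prediction column name
    return {"answer_length": len(data.get("answer", ""))}


row_test_data = {"question": "What is the capital of France?"}   # from test_data
row_prediction = {"answer": "Paris"}                              # from prediction_data

# Equivalent of: self._submit_method(metric.calculate, data={**data[i], **response[i]})
print(answer_length(data={**row_test_data, **row_prediction}))    # {'answer_length': 5}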
sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_metrics_handler/_prompt_metric_handler.py

Lines changed: 29 additions & 11 deletions
@@ -9,20 +9,25 @@
 import pandas as pd
 import logging
 import tqdm.asyncio
+from numpy import NaN

 from .._client.openai_client import AzureOpenAIClient
 from .._metric_handler import MetricHandler
 from ..metrics._custom_metric import PromptMetric
+from ..metrics._parsers import JsonParser, NumberParser

 LOGGER = logging.getLogger(__name__)

+SUPPORTED_PARSERS = [JsonParser, NumberParser]
+

 class PromptMetricHandler(MetricHandler):
     def __init__(
         self,
         task_type,
         prediction_data,
         test_data,
+        input_output_data,
         metrics_mapping=None,
         metrics=None,
     ):

@@ -32,6 +37,7 @@ def __init__(
             test_data=test_data,
             metrics_mapping=metrics_mapping,
             metrics=metrics,
+            input_output_data=input_output_data,
         )

         self._validate()

@@ -43,6 +49,7 @@ def _validate(self):
            raise Exception \
                (f"{self.__class__.__name__} supports only {PromptMetric.__class__.__name__} type of metrics")

+
    def _convert_metric_to_message(self, metric, data):
        from jinja2 import Template

@@ -78,11 +85,32 @@ def _get_data_for_metric(self, metric):

         return data_as_jsonl

+    def _parser_response(self, value, metric):
+        result = {metric.name: NaN}
+        parsed_value = None
+
+        for parser in SUPPORTED_PARSERS:
+            parsed_value = parser.parse(value)
+            if parsed_value:
+                result = parsed_value
+                break
+
+        if parsed_value:
+            if isinstance(parsed_value, dict):
+                result = {f"{metric.name}_{key}": value for key, value in parsed_value.items()}
+            else:
+                result = {metric.name: parsed_value}
+
+        if parsed_value is None:
+            LOGGER.debug("Result from LLM should be in json format or a number")
+
+        return result
+
     async def _compute_metric_row(self, metric, data):
         message = self._convert_metric_to_message(metric, data)
         response = await self._client.bounded_chat_completion(message)
         content = self._client.get_chat_completion_content_from_response(response)
-        result = metric._parser.parse(content if content is not None else response, metric)
+        result = self._parser_response(content if content is not None else response, metric)
         return result

     async def _compute_metric(self, metric):

@@ -101,16 +129,6 @@ async def _compute_metric(self, metric):
             key: [row[key] for row in responses]
         })

-        if metric.aggregator:
-            aggregated_values = metric.aggregator(
-                values=results.get("artifacts").get(metric.name)
-            )
-            results["metrics"].update(
-                {
-                    f"{key}_{metric.name}": value for key, value in aggregated_values.items()
-                }
-            )
-
         return results

     async def _compute_metrics(self, metrics):
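The new _parser_response method tries each configured parser in turn, prefixes dictionary results with the metric name, and falls back to NaN when nothing parses. The JsonParser and NumberParser classes are not shown in this commit, so the sketch below stands them in with json.loads and float; treat it as a hedged approximation, not the module's code.

# Approximation of the parser fallback; json.loads/float stand in for the SDK parsers.
import json
from math import nan


def parse_llm_reply(value, metric_name):
    parsed = None
    try:
        parsed = json.loads(value)          # JsonParser stand-in
    except (TypeError, ValueError):
        try:
            parsed = float(value)           # NumberParser stand-in
        except (TypeError, ValueError):
            parsed = None

    if isinstance(parsed, dict):
        # Dict results get the metric name as a key prefix, e.g. relevance_score
        return {f"{metric_name}_{key}": val for key, val in parsed.items()}
    if parsed is not None:
        return {metric_name: parsed}
    return {metric_name: nan}               # "Result from LLM should be in json format or a number"


print(parse_llm_reply('{"score": 4, "reason": "grounded"}', "relevance"))
print(parse_llm_reply("3", "relevance"))
print(parse_llm_reply("not a number", "relevance"))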
sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_user_agent.py

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from azure.ai.generative._version import VERSION
+
+USER_AGENT = "{}/{} {}/{}".format("azure-ai-generative", VERSION, "evaluate", VERSION)
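This new module gives the evaluate package its own user-agent string, which the handlers above pass to PFClient(user_agent=USER_AGENT). A quick sketch of the resulting string; the version value here is a placeholder, not the real package version.

# Sketch of the user-agent string format; VERSION is a stand-in for
# azure.ai.generative._version.VERSION.
VERSION = "1.0.0b7"  # hypothetical placeholder

USER_AGENT = "{}/{} {}/{}".format("azure-ai-generative", VERSION, "evaluate", VERSION)
print(USER_AGENT)  # azure-ai-generative/1.0.0b7 evaluate/1.0.0b7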

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_utils.py

Lines changed: 4 additions & 0 deletions
@@ -186,3 +186,7 @@ def _copy_artifact(source, destination):

     pathlib.Path(destination).mkdir(exist_ok=True, parents=True)
     shutil.copy2(source, destination)
+
+
+def is_lambda_function(obj):
+    return callable(obj) and obj.__name__ == "<lambda>"
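The is_lambda_function helper relies on the fact that every lambda reports __name__ == "<lambda>", which is what lets _validate_metrics reject anonymous functions while accepting named ones. A quick check of that rule:

# Quick check of the lambda-detection rule used above.
def is_lambda_function(obj):
    return callable(obj) and obj.__name__ == "<lambda>"


def my_code_metric(*, data, **kwargs):  # a named function passes
    return {"ok": 1}


print(is_lambda_function(lambda data: {"ok": 1}))  # True  -> rejected as a code metric
print(is_lambda_function(my_code_metric))          # False -> accepted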

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/metrics/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -4,9 +4,8 @@

 __path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore

-from ._custom_metric import CodeMetric, PromptMetric
+from ._custom_metric import PromptMetric

 __all__ = [
-    "CodeMetric",
     "PromptMetric"
 ]

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/metrics/_aggregators.py

Lines changed: 0 additions & 24 deletions
This file was deleted.
