
Commit 5ee1d7a

Spec Review feedback (#34032)
* Spec Review feedback incorporated
* Updating signature for code metric handler
* Rebased from main
* Adding tests and user agent
1 parent bde74f3 commit 5ee1d7a

14 files changed: +251, -157 lines changed

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_client/openai_client.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 from openai import AsyncAzureOpenAI
 from openai.types.chat.chat_completion import ChatCompletion

-from azure.ai.generative._user_agent import USER_AGENT
+from azure.ai.generative.evaluate._user_agent import USER_AGENT
 from azure.ai.generative.constants._common import USER_AGENT_HEADER_KEY

 semaphore = asyncio.Semaphore(10)

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_evaluate.py

Lines changed: 25 additions & 14 deletions
@@ -9,9 +9,11 @@
 import tempfile
 import time
 import logging
+from collections import Counter
 from json import JSONDecodeError
 from pathlib import Path
 from typing import Callable, Optional, Dict, List, Mapping
+from types import FunctionType

 import mlflow
 import numpy as np

@@ -25,7 +27,8 @@

 from azure.ai.generative.evaluate._metric_handler import MetricHandler
 from azure.ai.generative.evaluate._metrics_handler._code_metric_handler import CodeMetricHandler
-from azure.ai.generative.evaluate._utils import _is_flow, load_jsonl, _get_artifact_dir_path, _copy_artifact
+from azure.ai.generative.evaluate._utils import _is_flow, load_jsonl, _get_artifact_dir_path, _copy_artifact, \
+    is_lambda_function
 from azure.ai.generative.evaluate._mlflow_log_collector import RedirectUserOutputStreams
 from azure.ai.generative.evaluate._constants import SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING, SUPPORTED_TASK_TYPE, CHAT, \
     SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING

@@ -84,25 +87,33 @@ def _log_metrics(run_id, metrics):


 def _validate_metrics(metrics, task_type):
-    genai_metrics = []
+    prompt_metrics = []
     builtin_metrics =[]
+    code_metrics = []
     unknown_metrics = []

     for metric in metrics:
-        if isinstance(metric, GenAIMetric):
-            genai_metrics.append(metric.name)
+        if isinstance(metric, PromptMetric):
+            prompt_metrics.append(metric)
         elif isinstance(metric, str) and metric in SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING[task_type].SUPPORTED_LIST:
             builtin_metrics.append(metric)
+        elif isinstance(metric, FunctionType):
+            if is_lambda_function(metric):
+                raise Exception("Lambda methods are not supported as code metrics")
+            code_metrics.append(metric)
+
         else:
             unknown_metrics.append(metric)

     if len(unknown_metrics) > 0:
         raise Exception("Unsupported metric found in the list")

-    # if len(set(genai_metrics) & set(builtin_metrics)) > 0:
-    if len(genai_metrics) != len(set(genai_metrics)) or len(builtin_metrics) != len(set(builtin_metrics))\
-            or (len(set(genai_metrics) & set(builtin_metrics)) > 0):
-        raise Exception("Duplicate metric name found. Metric names should be unique")
+    counter = Counter(builtin_metrics + [metric.name for metric in prompt_metrics] + [metric.__name__ for metric in code_metrics])
+    duplicates = [key for key, value in counter.items() if value > 1]
+    if len(duplicates) > 0:
+        raise Exception(f"Duplicate metric name found {duplicates}. Metric names should be unique")
+
+    return builtin_metrics, prompt_metrics, code_metrics


 @distributed_trace

@@ -275,21 +286,20 @@ def _evaluate(
     if metrics is None:
         metrics = SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING[task_type].DEFAULT_LIST

-    _validate_metrics(metrics, task_type)
+    inbuilt_metrics, custom_prompt_metrics, code_metrics = _validate_metrics(metrics, task_type)

-    inbuilt_metrics = [metric for metric in metrics if not isinstance(metric, GenAIMetric)]
-    custom_prompt_metrics = [metric for metric in metrics if isinstance(metric, PromptMetric)]
-    code_metrics = [metric for metric in metrics if isinstance(metric, CodeMetric)]
+    # TODO : Once PF is used for inbuilt metrics parallelize submission of metrics calculation of different kind

     if custom_prompt_metrics:
         for metric in custom_prompt_metrics:
-            metrics_config.setdefault(metric.name, {param: param for param in metric.parameters})
+            metrics_config.setdefault(metric.name, {param: param for param in metric._template_variable})

         prompt_metric_handler = PromptMetricHandler(
             task_type="custom-prompt-metric",
             metrics=custom_prompt_metrics,
             prediction_data=asset_handler.prediction_data,
             test_data=asset_handler.test_data,
+            input_output_data=asset_handler.input_output_data,
             metrics_mapping=metrics_config,
         )

@@ -302,8 +312,9 @@ def _evaluate(
     if code_metrics:
         code_metric_handler = CodeMetricHandler(
             task_type="custom-code-metric",
-            metrics=code_metrics,
+            metrics=[CodeMetric(name=metric.__name__, calculate=metric) for metric in code_metrics],
             prediction_data=asset_handler.prediction_data,
+            input_output_data=asset_handler.input_output_data,
             test_data=asset_handler.test_data,
             metrics_mapping=metrics_config,
         )

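The reworked _validate_metrics above now sorts a mixed metrics list into built-in metric names, PromptMetric instances, and plain code-metric functions, rejects lambdas, and uses collections.Counter to flag duplicate names. Below is a minimal, standalone sketch of that triage logic under stated assumptions: it avoids the SDK imports, the prompt-metric object is only assumed to expose a .name attribute, and the supported built-in names and the gpt_groundedness function are hypothetical.

# Minimal sketch of the metric triage and duplicate check; not the SDK implementation.
from collections import Counter
from types import FunctionType


def split_metrics(metrics, supported_builtin_names):
    builtin, prompt, code = [], [], []
    for metric in metrics:
        if isinstance(metric, str) and metric in supported_builtin_names:
            builtin.append(metric)
        elif isinstance(metric, FunctionType):
            if metric.__name__ == "<lambda>":  # mirrors is_lambda_function
                raise Exception("Lambda methods are not supported as code metrics")
            code.append(metric)
        else:
            prompt.append(metric)  # assumed PromptMetric-like, exposing .name

    counter = Counter(builtin + [m.name for m in prompt] + [fn.__name__ for fn in code])
    duplicates = [name for name, count in counter.items() if count > 1]
    if duplicates:
        raise Exception(f"Duplicate metric name found {duplicates}. Metric names should be unique")
    return builtin, prompt, code


def gpt_groundedness(*, data, **kwargs):  # hypothetical user-defined code metric
    return {"length": len(str(data))}


try:
    # The code metric reuses a built-in name, so the duplicate check fires.
    split_metrics(["gpt_groundedness", gpt_groundedness], {"gpt_groundedness"})
except Exception as err:
    print(err)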
sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_local_flow_handler.py

Lines changed: 4 additions & 1 deletion
@@ -5,6 +5,7 @@
 import pandas as pd

 from ._base_handler import BaseHandler
+from ._user_agent import USER_AGENT
 from ._utils import df_to_dict_list, run_pf_flow_with_dict_list, wait_for_pf_run_to_complete


@@ -35,7 +36,9 @@ def execute_target(self):
         wait_for_pf_run_to_complete(pf_run_result.name)

         logger.debug("PF run results: %s", pf_run_result.properties)
-        pf_client = PFClient()
+        pf_client = PFClient(
+            user_agent=USER_AGENT
+        )
         result_df = pf_client.get_details(pf_run_result.name, all_results=True)

         # Rename inputs columns. E.g. inputs.question -> question

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_metric_handler.py

Lines changed: 9 additions & 1 deletion
@@ -2,13 +2,19 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import pandas as pd
+import logging

 from os import path
 from typing import Dict, Optional

 from azure.ai.generative.evaluate._constants import TASK_TYPE_TO_METRICS_MAPPING
+from ._user_agent import USER_AGENT
+
 from ._utils import run_pf_flow_with_dict_list, df_to_dict_list, wait_for_pf_run_to_complete

+LOGGER = logging.getLogger(__name__)
+
+
 class MetricHandler(object):

     def __init__(

@@ -49,7 +55,9 @@ def calculate_metrics(self) -> Dict:
         from promptflow import PFClient
         from promptflow.entities import AzureOpenAIConnection, OpenAIConnection

-        pf_client = PFClient()
+        pf_client = PFClient(
+            user_agent=USER_AGENT
+        )

         openai_config = self.metrics_mapping["openai_params"]
         conn_name = "openai_connection"

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_metrics_handler/_code_metric_handler.py

Lines changed: 3 additions & 15 deletions
@@ -20,6 +20,7 @@ def __init__(
         task_type,
         prediction_data,
         test_data,
+        input_output_data,
         metrics_mapping=None,
         metrics=None,
     ):

@@ -30,6 +31,7 @@ def __init__(
             test_data=test_data,
             metrics_mapping=metrics_mapping,
             metrics=metrics,
+            input_output_data=input_output_data,
         )

         self._validate()

@@ -84,7 +86,7 @@ def _calculate_metric(self, metric, data, response):
         with ThreadPoolExecutor(thread_name_prefix="code_metrics_row") as thread_pool:
             for i in range(0, len(data)):
                 row_metric_futures.append(thread_pool.submit(
-                    self._submit_method, metric.calculate, data=data[i], response=response[i]
+                    self._submit_method, metric.calculate, data={**data[i], **response[i]}
                 ))

             for row_metric_future in row_metric_futures:

@@ -107,18 +109,4 @@ def _calculate_metric(self, metric, data, response):
             {metric.name: row_metric_results}
         )

-        if metric.aggregator:
-            try:
-                aggregated_values = self._submit_method(
-                    metric.aggregator,
-                    values=results.get("artifacts").get(metric.name)
-                )
-                results["metrics"].update(
-                    {
-                        f"{key}_{metric.name}": value for key, value in aggregated_values.items()
-                    }
-                )
-            except Exception as ex:
-                LOGGER.info(
-                    f"Error aggregating values for metric {metric.name} , failed with error {str(ex)} : Stack trace : {str(ex.__traceback__)}")
         return results

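The handler change above switches the per-row call from separate data/response arguments to a single merged dict, so a code-metric function sees test-data columns and prediction columns together. A small illustration under stated assumptions: the column names are hypothetical, and _submit_method is assumed to forward keyword arguments to the callable unchanged.

# Illustration only: a code metric receiving one merged row dict.
def answer_length(*, data, **kwargs):
    # "answer" is a hypothetical prediction column name
    return {"answer_length": len(data.get("answer", ""))}


row_test_data = {"question": "What is the capital of France?"}   # from test_data
row_prediction = {"answer": "Paris"}                              # from prediction_data

# Equivalent of: self._submit_method(metric.calculate, data={**data[i], **response[i]})
print(answer_length(data={**row_test_data, **row_prediction}))    # {'answer_length': 5}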
sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_metrics_handler/_prompt_metric_handler.py

Lines changed: 29 additions & 11 deletions
@@ -9,20 +9,25 @@
 import pandas as pd
 import logging
 import tqdm.asyncio
+from numpy import NaN

 from .._client.openai_client import AzureOpenAIClient
 from .._metric_handler import MetricHandler
 from ..metrics._custom_metric import PromptMetric
+from ..metrics._parsers import JsonParser, NumberParser

 LOGGER = logging.getLogger(__name__)

+SUPPORTED_PARSERS = [JsonParser, NumberParser]
+

 class PromptMetricHandler(MetricHandler):
     def __init__(
         self,
         task_type,
         prediction_data,
         test_data,
+        input_output_data,
         metrics_mapping=None,
         metrics=None,
     ):

@@ -32,6 +37,7 @@ def __init__(
             test_data=test_data,
             metrics_mapping=metrics_mapping,
             metrics=metrics,
+            input_output_data=input_output_data,
         )

         self._validate()

@@ -43,6 +49,7 @@ def _validate(self):
            raise Exception \
                (f"{self.__class__.__name__} supports only {PromptMetric.__class__.__name__} type of metrics")

+
    def _convert_metric_to_message(self, metric, data):
        from jinja2 import Template

@@ -78,11 +85,32 @@ def _get_data_for_metric(self, metric):

         return data_as_jsonl

+    def _parser_response(self, value, metric):
+        result = {metric.name: NaN}
+        parsed_value = None
+
+        for parser in SUPPORTED_PARSERS:
+            parsed_value = parser.parse(value)
+            if parsed_value:
+                result = parsed_value
+                break
+
+        if parsed_value:
+            if isinstance(parsed_value, dict):
+                result = {f"{metric.name}_{key}": value for key, value in parsed_value.items()}
+            else:
+                result = {metric.name: parsed_value}
+
+        if parsed_value is None:
+            LOGGER.debug("Result from LLM should be in json format or a number")
+
+        return result
+
     async def _compute_metric_row(self, metric, data):
         message = self._convert_metric_to_message(metric, data)
         response = await self._client.bounded_chat_completion(message)
         content = self._client.get_chat_completion_content_from_response(response)
-        result = metric._parser.parse(content if content is not None else response, metric)
+        result = self._parser_response(content if content is not None else response, metric)
         return result

     async def _compute_metric(self, metric):

@@ -101,16 +129,6 @@ async def _compute_metric(self, metric):
             key: [row[key] for row in responses]
         })

-        if metric.aggregator:
-            aggregated_values = metric.aggregator(
-                values=results.get("artifacts").get(metric.name)
-            )
-            results["metrics"].update(
-                {
-                    f"{key}_{metric.name}": value for key, value in aggregated_values.items()
-                }
-            )
-
         return results

     async def _compute_metrics(self, metrics):
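The new _parser_response method tries each configured parser in turn, prefixes dictionary results with the metric name, and falls back to NaN when nothing parses. The JsonParser and NumberParser classes are not shown in this commit, so the sketch below stands them in with json.loads and float; treat it as a hedged approximation, not the module's code.

# Approximation of the parser fallback; json.loads/float stand in for the SDK parsers.
import json
from math import nan


def parse_llm_reply(value, metric_name):
    parsed = None
    try:
        parsed = json.loads(value)          # JsonParser stand-in
    except (TypeError, ValueError):
        try:
            parsed = float(value)           # NumberParser stand-in
        except (TypeError, ValueError):
            parsed = None

    if isinstance(parsed, dict):
        # Dict results get the metric name as a key prefix, e.g. relevance_score
        return {f"{metric_name}_{key}": val for key, val in parsed.items()}
    if parsed is not None:
        return {metric_name: parsed}
    return {metric_name: nan}               # "Result from LLM should be in json format or a number"


print(parse_llm_reply('{"score": 4, "reason": "grounded"}', "relevance"))
print(parse_llm_reply("3", "relevance"))
print(parse_llm_reply("not a number", "relevance"))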
sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_user_agent.py

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from azure.ai.generative._version import VERSION
+
+USER_AGENT = "{}/{} {}/{}".format("azure-ai-generative", VERSION, "evaluate", VERSION)
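This new module gives the evaluate package its own user-agent string, which the handlers above pass to PFClient(user_agent=USER_AGENT). A quick sketch of the resulting string; the version value here is a placeholder, not the real package version.

# Sketch of the user-agent string format; VERSION is a stand-in for
# azure.ai.generative._version.VERSION.
VERSION = "1.0.0b7"  # hypothetical placeholder

USER_AGENT = "{}/{} {}/{}".format("azure-ai-generative", VERSION, "evaluate", VERSION)
print(USER_AGENT)  # azure-ai-generative/1.0.0b7 evaluate/1.0.0b7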

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_utils.py

Lines changed: 4 additions & 0 deletions
@@ -186,3 +186,7 @@ def _copy_artifact(source, destination):

     pathlib.Path(destination).mkdir(exist_ok=True, parents=True)
     shutil.copy2(source, destination)
+
+
+def is_lambda_function(obj):
+    return callable(obj) and obj.__name__ == "<lambda>"
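The is_lambda_function helper relies on the fact that every lambda reports __name__ == "<lambda>", which is what lets _validate_metrics reject anonymous functions while accepting named ones. A quick check of that rule:

# Quick check of the lambda-detection rule used above.
def is_lambda_function(obj):
    return callable(obj) and obj.__name__ == "<lambda>"


def my_code_metric(*, data, **kwargs):  # a named function passes
    return {"ok": 1}


print(is_lambda_function(lambda data: {"ok": 1}))  # True  -> rejected as a code metric
print(is_lambda_function(my_code_metric))          # False -> accepted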

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/metrics/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -4,9 +4,8 @@

 __path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore

-from ._custom_metric import CodeMetric, PromptMetric
+from ._custom_metric import PromptMetric

 __all__ = [
-    "CodeMetric",
     "PromptMetric"
 ]

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/metrics/_aggregators.py

Lines changed: 0 additions & 24 deletions
This file was deleted.
