Commit d6a956c

Custom metrics in evaluate (#33339)
* Custom metrics in evaluate
* Custom evaluate metric with system prompt
* Updated to allow manageable response for custom metrics
* Adding reason for custom metrics
* Code-based custom metrics
* Adding TODO item
* Adding comment for prompt-based metric
* Prompt-based metrics
* Custom metric, code- and prompt-based
* Adding docstring for custom metrics
* Fixing spell checks
* Fixing spell check errors
* Fixing file path for spell check
* Adding aggregators
* Adding user-agent to AOAI calls
* Review comments
1 parent c13ec18 commit d6a956c

File tree

17 files changed: +796 −63 lines changed


.vscode/cspell.json

Lines changed: 7 additions & 0 deletions
@@ -1246,6 +1246,13 @@
         "smirnov"
       ]
     },
+    {
+      "filename": "sdk/ai/azure-ai-generative/**",
+      "words": [
+        "tqdm",
+        "genai"
+      ]
+    },
     {
       "filename": "sdk/attestation/azure-security-attestation/tests/conftest.py",
       "words":[

sdk/ai/azure-ai-generative/MANIFEST.in

Lines changed: 1 addition & 0 deletions
@@ -5,5 +5,6 @@ include azure/__init__.py
 include azure/ai/__init__.py
 include azure/ai/generative/py.typed
 include azure/ai/generative/index/_utils/encodings/*
+include azure/ai/generative/evaluate/metrics/templates/*
 recursive-include azure/ai/generative/synthetic/templates *.txt
 recursive-include azure/ai/generative/synthetic/simulator/templates *.md

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_client/__init__.py

Whitespace-only changes.
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import asyncio
+import logging
+
+from openai import AsyncAzureOpenAI
+from openai.types.chat.chat_completion import ChatCompletion
+
+from azure.ai.generative._user_agent import USER_AGENT
+from azure.ai.generative.constants._common import USER_AGENT_HEADER_KEY
+
+semaphore = asyncio.Semaphore(10)
+
+LOGGER = logging.getLogger(__name__)
+
+
+class AzureOpenAIClient:
+
+    def __init__(self, openai_params):
+        self._azure_endpoint = openai_params.get("azure_endpoint", None) if openai_params.get("azure_endpoint", None) \
+            else openai_params.get("api_base", None)
+        self._api_key = openai_params.get("api_key", None)
+        self._api_version = openai_params.get("api_version", None)
+        self._azure_deployment = openai_params.get("azure_deployment", None) \
+            if openai_params.get("azure_deployment", None) else openai_params.get("deployment_id", None)
+
+        self._client = AsyncAzureOpenAI(
+            azure_endpoint=self._azure_endpoint,
+            api_version=self._api_version,
+            api_key=self._api_key,
+            default_headers={
+                USER_AGENT_HEADER_KEY: USER_AGENT,
+                "client_operation_source": "evaluate"
+            },
+        )
+
+    async def bounded_chat_completion(self, messages):
+        async with semaphore:
+            try:
+                result = await self._client.with_options(max_retries=5).chat.completions.create(
+                    model=self._azure_deployment,
+                    messages=messages,
+                    temperature=0,
+                    seed=0,
+                )
+                return result
+            except Exception as ex:
+                LOGGER.debug(f"Failed to call llm with exception : {str(ex)}")
+                return ex
+
+    @staticmethod
+    def get_chat_completion_content_from_response(response):
+        if isinstance(response, ChatCompletion):
+            return response.choices[0].message.content
+        return None
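
For orientation, the following is a minimal sketch of how the client above might be driven on its own, outside the evaluate pipeline. The openai_params keys mirror the ones read in __init__; the endpoint, key, API version, deployment name, the import location, and the asyncio.run driver are illustrative assumptions rather than part of this commit. Note that the module-level semaphore caps concurrent completions at 10 across all uses of the client.

import asyncio

# AzureOpenAIClient is the class added above; import it from wherever the package
# exposes the new _client module (the exact module name is not shown on this page).

# Hypothetical parameter dict; the keys match what AzureOpenAIClient.__init__ reads.
openai_params = {
    "azure_endpoint": "https://<resource>.openai.azure.com/",  # assumption: your AOAI endpoint
    "api_key": "<api-key>",                                    # assumption: your AOAI key
    "api_version": "2023-07-01-preview",                       # assumption: any supported API version
    "azure_deployment": "<chat-deployment>",                   # assumption: your chat deployment name
}

async def main():
    client = AzureOpenAIClient(openai_params)
    # Returns a ChatCompletion on success, or the exception object on failure (it is not re-raised).
    response = await client.bounded_chat_completion(
        [{"role": "user", "content": "Say hello."}]
    )
    # Yields None when the response is not a ChatCompletion (e.g. the call above failed).
    print(AzureOpenAIClient.get_chat_completion_content_from_response(response))

asyncio.run(main())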

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_constants.py

Lines changed: 5 additions & 0 deletions
@@ -75,3 +75,8 @@ class ChatMetrics
     "qa": QaMetrics,
     "rag-evaluation": ChatMetrics
 }
+
+SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING = {
+    QA: QaMetrics,
+    CHAT: ChatMetrics
+}

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_evaluate.py

Lines changed: 144 additions & 38 deletions
@@ -1,16 +1,19 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+import copy
 import json
 import os
 import shutil
 import tempfile
 import time
 import logging
+from json import JSONDecodeError
 from pathlib import Path
 from typing import Callable, Optional, Dict, List, Mapping

 import mlflow
+import numpy as np
 import pandas as pd
 from azure.core.tracing.decorator import distributed_trace
 from azure.ai.generative._telemetry import ActivityType, monitor_with_activity, monitor_with_telemetry_mixin, ActivityLogger
@@ -20,12 +23,16 @@
 from mlflow.protos.databricks_pb2 import ErrorCode, INVALID_PARAMETER_VALUE

 from azure.ai.generative.evaluate._metric_handler import MetricHandler
+from azure.ai.generative.evaluate._metrics_handler._code_metric_handler import CodeMetricHandler
 from azure.ai.generative.evaluate._utils import _is_flow, load_jsonl, _get_artifact_dir_path, _copy_artifact
 from azure.ai.generative.evaluate._mlflow_log_collector import RedirectUserOutputStreams
-from azure.ai.generative.evaluate._constants import SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING, SUPPORTED_TASK_TYPE, CHAT
+from azure.ai.generative.evaluate._constants import SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING, SUPPORTED_TASK_TYPE, CHAT, \
+    TYPE_TO_KWARGS_MAPPING, SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING
 from azure.ai.generative.evaluate._evaluation_result import EvaluationResult
+from ._metrics_handler._prompt_metric_handler import PromptMetricHandler

 from ._utils import _write_properties_to_run_history
+from .metrics._custom_metric import CodeMetric, PromptMetric, Metric as GenAIMetric

 LOGGER = logging.getLogger(__name__)

@@ -47,6 +54,19 @@ def _get_handler_class(
     return handler


+def _get_metric_handler_class(
+        asset,
+):
+    if _is_flow(asset):
+        from azure.ai.generative.evaluate._local_flow_handler import LocalFlowHandler
+        handler = LocalFlowHandler
+    else:
+        from azure.ai.generative.evaluate._local_code_handler import LocalCodeHandler
+        handler = LocalCodeHandler
+
+    return handler
+
+
 def _validate_data(data, prediction_data, truth_data):
     errors = []
     prediction_data_column = ""
@@ -83,6 +103,28 @@ def _log_metrics(run_id, metrics):
     )


+def _validate_metrics(metrics, task_type):
+    genai_metrics = []
+    builtin_metrics =[]
+    unknown_metrics = []
+
+    for metric in metrics:
+        if isinstance(metric, GenAIMetric):
+            genai_metrics.append(metric.name)
+        elif isinstance(metric, str) and metric in SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING[task_type].SUPPORTED_LIST:
+            builtin_metrics.append(metric)
+        else:
+            unknown_metrics.append(metric)
+
+    if len(unknown_metrics) > 0:
+        raise Exception("Unsupported metric found in the list")
+
+    # if len(set(genai_metrics) & set(builtin_metrics)) > 0:
+    if len(genai_metrics) != len(set(genai_metrics)) or len(builtin_metrics) != len(set(builtin_metrics))\
+            or (len(set(genai_metrics) & set(builtin_metrics)) > 0):
+        raise Exception("Duplicate metric name found. Metric names should be unique")
+
+
 @distributed_trace
 @monitor_with_activity(package_logger, "Evaluate", ActivityType.PUBLICAPI)
 def evaluate(
@@ -223,7 +265,7 @@ def _evaluate(
     metrics_config.update({"openai_params": model_config})

     if data_mapping:
-        metrics_config.update(data_mapping)
+        metrics_config.update({"data_mapping": data_mapping})

     with mlflow.start_run(nested=True if mlflow.active_run() else False, run_name=evaluation_name) as run, \
             RedirectUserOutputStreams(logger=LOGGER) as _:
@@ -246,43 +288,81 @@ def _evaluate(
             **kwargs
         )

-        metrics_handler = MetricHandler(
-            task_type=SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING[task_type],
-            metrics=metrics,
-            prediction_data=asset_handler.prediction_data,
-            truth_data=asset_handler.ground_truth,
-            test_data=asset_handler.test_data,
-            metrics_mapping=metrics_config,
-            prediction_data_column_name=prediction_data if isinstance(prediction_data, str) else None,
-            ground_truth_column_name=truth_data if isinstance(truth_data, str) else None,
-        )
+        metrics_results = {"artifacts": {}, "metrics": {}}

-        metrics = metrics_handler.calculate_metrics()
+        if metrics is None:
+            metrics = SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING[task_type].DEFAULT_LIST

-        def _get_instance_table():
-            metrics.get("artifacts").pop("bertscore", None)
-            if task_type == CHAT:
-                instance_level_metrics_table = _get_chat_instance_table(metrics.get("artifacts"))
-            else:
-                instance_level_metrics_table = pd.DataFrame(metrics.get("artifacts"))
-
-            prediction_data = asset_handler.prediction_data
-            for column in asset_handler.prediction_data.columns.values:
-                if column in asset_handler.test_data.columns.values:
-                    prediction_data.drop(column, axis=1, inplace=True)
-
-            combined_table = pd.concat(
-                [asset_handler.test_data,
-                 prediction_data,
-                 asset_handler.ground_truth,
-                 instance_level_metrics_table
-                 ],
-                axis=1,
-                verify_integrity=True
+        _validate_metrics(metrics, task_type)
+
+        inbuilt_metrics = [metric for metric in metrics if not isinstance(metric, GenAIMetric)]
+        custom_prompt_metrics = [metric for metric in metrics if isinstance(metric, PromptMetric)]
+        code_metrics = [metric for metric in metrics if isinstance(metric, CodeMetric)]
+
+        # TODO : Once PF is used for inbuilt metrics parallelize submission of metrics calculation of different kind
+
+        if custom_prompt_metrics:
+            for metric in custom_prompt_metrics:
+                metrics_config.setdefault(metric.name, {param: param for param in metric.parameters})
+
+            prompt_metric_handler = PromptMetricHandler(
+                task_type="custom-prompt-metric",
+                metrics=custom_prompt_metrics,
+                prediction_data=asset_handler.prediction_data,
+                truth_data=asset_handler.ground_truth,
+                test_data=asset_handler.test_data,
+                metrics_mapping=metrics_config,
+                prediction_data_column_name=prediction_data if isinstance(prediction_data, str) else None,
+                ground_truth_column_name=truth_data if isinstance(truth_data, str) else None,
+                type_to_kwargs="custom-prompt-metric"
             )
-        return combined_table

-        _log_metrics(run_id=run.info.run_id, metrics=metrics.get("metrics"))
+            prompt_metric_results = prompt_metric_handler.calculate_metrics()
+
+            if prompt_metric_results is not None:
+                for k, v in metrics_results.items():
+                    v.update(prompt_metric_results[k])
+
+        if code_metrics:
+            code_metric_handler = CodeMetricHandler(
+                task_type="custom-code-metric",
+                metrics=code_metrics,
+                prediction_data=asset_handler.prediction_data,
+                truth_data=asset_handler.ground_truth,
+                test_data=asset_handler.test_data,
+                metrics_mapping=metrics_config,
+                prediction_data_column_name=prediction_data if isinstance(prediction_data, str) else None,
+                ground_truth_column_name=truth_data if isinstance(truth_data, str) else None,
+                type_to_kwargs="code-prompt-metric"
+            )
+
+            code_metric_results = code_metric_handler.calculate_metrics()
+
+            if code_metric_results is not None:
+                for k, v in metrics_results.items():
+                    v.update(code_metric_results[k])
+
+        if inbuilt_metrics:
+            inbuilt_metrics_handler = MetricHandler(
+                task_type=SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING[task_type],
+                metrics=inbuilt_metrics,
+                prediction_data=asset_handler.prediction_data,
+                truth_data=asset_handler.ground_truth,
+                test_data=asset_handler.test_data,
+                metrics_mapping=metrics_config,
+                prediction_data_column_name=prediction_data if isinstance(prediction_data, str) else None,
+                ground_truth_column_name=truth_data if isinstance(truth_data, str) else None,
+                type_to_kwargs=TYPE_TO_KWARGS_MAPPING[task_type]
+            )
+
+            inbuilt_metrics_results = inbuilt_metrics_handler.calculate_metrics()
+
+            if inbuilt_metrics_results is not None:
+                for k, v in metrics_results.items():
+                    v.update(inbuilt_metrics_results[k])
+
+        if metrics_results.get("metrics"):
+            _log_metrics(run_id=run.info.run_id, metrics=metrics_results.get("metrics"))

         with tempfile.TemporaryDirectory() as tmpdir:
             for param_name, param_value in kwargs.get("params_dict", {}).items():
@@ -310,7 +390,9 @@ def _get_instance_table():
             else:
                 raise ex

-        eval_artifact_df = _get_instance_table().to_json(orient="records", lines=True, force_ascii=False)
+        eval_artifact_df = _get_instance_table(metrics_results, task_type, asset_handler).to_json(orient="records",
+                                                                                                  lines=True,
+                                                                                                  force_ascii=False)
         tmp_path = os.path.join(tmpdir, "eval_results.jsonl")

         with open(tmp_path, "w", encoding="utf-8") as f:
@@ -322,13 +404,12 @@ def _get_instance_table():
         mlflow.log_param("task_type", task_type)
         if task_type == CHAT:
             log_property("_azureml.chat_history_column", data_mapping.get("y_pred"))
-            # log_param_and_tag("_azureml.evaluate_metric_mapping", json.dumps(metrics_handler._metrics_mapping_to_log))

         if output_path:
             _copy_artifact(tmp_path, output_path)

         evaluation_result = EvaluationResult(
-            metrics_summary=metrics.get("metrics"),
+            metrics_summary=metrics_results.get("metrics"),
             artifacts={
                 "eval_results.jsonl": f"runs:/{run.info.run_id}/eval_results.jsonl"
             },
@@ -396,3 +477,28 @@ def _get_chat_instance_table(metrics):

     instance_level_metrics_table = pd.DataFrame(instance_table_metrics_dict)
     return instance_level_metrics_table
+
+
+def _get_instance_table(metrics, task_type, asset_handler):
+    if metrics.get("artifacts"):
+        metrics.get("artifacts").pop("bertscore", None)
+        if task_type == CHAT:
+            instance_level_metrics_table = _get_chat_instance_table(metrics.get("artifacts"))
+        else:
+            instance_level_metrics_table = pd.DataFrame(metrics.get("artifacts"))
+
+    prediction_data = asset_handler.prediction_data
+    for column in asset_handler.prediction_data.columns.values:
+        if column in asset_handler.test_data.columns.values:
+            prediction_data.drop(column, axis=1, inplace=True)
+
+    combined_table = pd.concat(
+        [asset_handler.test_data,
+         prediction_data,
+         asset_handler.ground_truth,
+         instance_level_metrics_table
+         ],
+        axis=1,
+        verify_integrity=True
+    )
+    return combined_table
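
To see the new control flow in one place, here is a self-contained sketch of the partition-and-validate step added above. The Metric/PromptMetric/CodeMetric classes below are stand-ins for the ones imported from .metrics._custom_metric (their real constructors are not shown in this diff), the supported-metric lookup is simplified to a plain set, and the duplicate check is compressed into a single combined-name test that rejects the same cases as the three-way check in _validate_metrics.

# Stand-in classes: the real ones live in azure.ai.generative.evaluate.metrics._custom_metric.
class Metric:
    def __init__(self, name):
        self.name = name

class PromptMetric(Metric):
    pass

class CodeMetric(Metric):
    pass

# Simplified stand-in for SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING[task_type].SUPPORTED_LIST.
SUPPORTED_BUILTIN = {"gpt_groundedness", "gpt_relevance", "f1_score"}

def split_and_validate(metrics):
    # Partition the mixed list the way _evaluate does: GenAI metric objects vs. built-in names.
    genai = [m for m in metrics if isinstance(m, Metric)]
    builtin = [m for m in metrics if isinstance(m, str) and m in SUPPORTED_BUILTIN]
    unknown = [m for m in metrics if not isinstance(m, Metric) and m not in builtin]
    if unknown:
        raise Exception("Unsupported metric found in the list")

    # Names must be unique across custom and built-in metrics.
    names = [m.name for m in genai] + builtin
    if len(names) != len(set(names)):
        raise Exception("Duplicate metric name found. Metric names should be unique")

    # Each bucket is then routed to its own handler:
    # PromptMetricHandler, CodeMetricHandler, and MetricHandler respectively.
    return (
        [m for m in metrics if isinstance(m, PromptMetric)],
        [m for m in metrics if isinstance(m, CodeMetric)],
        builtin,
    )

prompt_metrics, code_metrics, inbuilt = split_and_validate(
    ["gpt_relevance", PromptMetric("politeness"), CodeMetric("answer_length")]
)
print([m.name for m in prompt_metrics], [m.name for m in code_metrics], inbuilt)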
