
Commit 589b703

Use PF to run evaluation (#33707)
* pf template
* Use PF to run evaluation
1 parent 43452dd commit 589b703

30 files changed (+908 additions, -261 deletions)

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_base_handler.py

Lines changed: 13 additions & 18 deletions
@@ -9,19 +9,18 @@ class BaseHandler(metaclass=abc.ABCMeta):
 
     def __init__(self, asset, test_data, prediction_data=None, ground_truth=None, **kwargs):
         self._prediction_data = None
+        self._input_output_data = None
         self.asset = asset
 
         test_data_df = pd.DataFrame(test_data)
+        if self.asset is None:
+            self._input_output_data = test_data_df
+            self._prediction_data = test_data_df
 
         if isinstance(prediction_data, str) and prediction_data in test_data_df.columns:
             self._prediction_data = test_data_df[[prediction_data]]
             test_data_df = test_data_df.drop(prediction_data, axis=1)
 
-        self._ground_truth = None
-        if isinstance(ground_truth, str) and ground_truth in test_data_df.columns:
-            self._ground_truth = test_data_df[[ground_truth]]
-            test_data_df = test_data_df.drop(ground_truth, axis=1)
-
         self._test_data = test_data_df
 
         self.params_dict = kwargs.pop("params_dict", None)
@@ -33,23 +32,19 @@ def test_data(self):
     @property
     def prediction_data(self):
         if self._prediction_data is None:
-            prediction_data = self.generate_prediction_data()
-            prediction_data_df = pd.DataFrame(prediction_data)
-            self._prediction_data = prediction_data_df
+            self.execute_target()
         return self._prediction_data
 
     @property
-    def ground_truth(self):
-        return self._ground_truth
+    def input_output_data(self):
+        if self._input_output_data is None:
+            self.execute_target()
+        return self._input_output_data
+
 
     @abc.abstractmethod
-    def generate_prediction_data(self):
+    def execute_target(self):
         """
-        Abstract method to generated prediction data.
+        Abstract method to generated prediction data and input output data.
         Should be implemented by all subclasses.
-        """
-
-    def get_test_data_as_jsonl(self):
-        if self.params_dict:
-            return self.test_data.assign(**self.params_dict).to_dict("records")
-        return self.test_data.to_dict("records")
+        """

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_constants.py

Lines changed: 5 additions & 8 deletions
@@ -2,6 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+from typing import Dict, Union
+
 QA = "qa"
 CHAT = "chat"
 
@@ -12,11 +14,6 @@
     CHAT: "rag-evaluation",
 }
 
-TYPE_TO_KWARGS_MAPPING = {
-    "qa": ["questions", "contexts", "y_pred", "y_test"],
-    "rag-evaluation": ["y_pred"]
-}
-
 
 class EvaluationMetrics:
     """
@@ -71,9 +68,9 @@ class ChatMetrics:
 ]
 
 
-TASK_TYPE_TO_METRICS_MAPPING = {
-    "qa": QaMetrics,
-    "rag-evaluation": ChatMetrics
+TASK_TYPE_TO_METRICS_MAPPING: Dict[str, Union[QaMetrics, ChatMetrics]] = {
+    "qa": QaMetrics(),
+    "rag-evaluation": ChatMetrics()
 }
 
 SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING = {
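
As a quick illustration of what the typed mapping buys (the stub classes below stand in for the real QaMetrics/ChatMetrics), a lookup now returns a ready-made instance rather than a class, and the annotation tells type checkers the value type:

from typing import Dict, Union


class QaMetrics:  # stand-in stub; the real class lives in _constants.py
    pass


class ChatMetrics:  # stand-in stub
    pass


TASK_TYPE_TO_METRICS_MAPPING: Dict[str, Union[QaMetrics, ChatMetrics]] = {
    "qa": QaMetrics(),
    "rag-evaluation": ChatMetrics(),
}

metrics = TASK_TYPE_TO_METRICS_MAPPING["qa"]
print(isinstance(metrics, QaMetrics))  # True -- an instance, not the class object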

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_data_fetcher.py

Lines changed: 0 additions & 34 deletions
This file was deleted.

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_evaluate.py

Lines changed: 25 additions & 62 deletions
@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import copy
+from hmac import new
 import json
 import os
 import shutil
@@ -27,7 +28,7 @@
 from azure.ai.generative.evaluate._utils import _is_flow, load_jsonl, _get_artifact_dir_path, _copy_artifact
 from azure.ai.generative.evaluate._mlflow_log_collector import RedirectUserOutputStreams
 from azure.ai.generative.evaluate._constants import SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING, SUPPORTED_TASK_TYPE, CHAT, \
-    TYPE_TO_KWARGS_MAPPING, SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING
+    SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING
 from azure.ai.generative.evaluate._evaluation_result import EvaluationResult
 from ._metrics_handler._prompt_metric_handler import PromptMetricHandler
 
@@ -67,27 +68,6 @@ def _get_metric_handler_class(
     return handler
 
 
-def _validate_data(data, prediction_data, truth_data):
-    errors = []
-    prediction_data_column = ""
-    truth_data_column = ""
-
-    if isinstance(prediction_data, str):
-        prediction_data_column = data[0].get(prediction_data, None)
-
-    if isinstance(truth_data, str):
-        truth_data_column = data[0].get(truth_data, None)
-
-    if prediction_data_column is None:
-        errors.append("prediction_data column not found in data")
-
-    if truth_data_column is None:
-        errors.append("truth_data column not found in data")
-
-    if len(errors) > 1:
-        raise Exception(f'Invalid data {" ,".join(errors)}')
-
-
 def _log_metrics(run_id, metrics):
     """
     Helper method to log metrics into specified run.
@@ -135,7 +115,7 @@ def evaluate(
     task_type: Optional[str] = None,
     metrics_list: Optional[List[str]] = None,
     model_config: Optional[Dict[str, str]] = None,
-    data_mapping: Optional[Mapping] = None,
+    data_mapping: Optional[Dict[str, str]] = None,
     output_path: Optional[str] = None,
     **kwargs
 ):
@@ -154,9 +134,9 @@
     :keyword metrics_list: List of metrics to calculate. A default list is picked based on task_type if not set.
     :paramtype metrics_list: Optional[List[str]]
     :keyword model_config: GPT configuration details needed for AI-assisted metrics.
-    :paramtype model_config: Dict[str, str]
+    :paramtype model_config: Optional[Dict[str, str]]
     :keyword data_mapping: GPT configuration details needed for AI-assisted metrics.
-    :paramtype data_mapping: typing.Mapping
+    :paramtype data_mapping: Optional[Dict[str, str]]
     :keyword output_path: The local folder path to save evaluation artifacts to if set
     :paramtype output_path: Optional[str]
     :keyword tracking_uri: Tracking uri to log evaluation results to AI Studio
@@ -182,8 +162,20 @@
     if model_config:
         metrics_config.update({"openai_params": model_config})
 
+
     if data_mapping:
-        metrics_config.update(data_mapping)
+        import warnings
+
+        new_data_mapping = dict(data_mapping)
+        if "y_pred" in new_data_mapping:
+            warnings.warn("y_pred is deprecated, please use \"answer\" instead")
+            value = data_mapping.pop("y_pred")
+            new_data_mapping.update({"answer": value})
+        if "y_test" in new_data_mapping:
+            warnings.warn("y_test is deprecated, please use \"ground_truth\" instead")
+            value = data_mapping.pop("y_test")
+            new_data_mapping.update({"ground_truth": value})
+        data_mapping = new_data_mapping
 
     sweep_args = kwargs.pop("sweep_args", None)
     if sweep_args:
@@ -230,8 +222,6 @@ def _evaluate(
     evaluation_name=None,
     target=None,
     data=None,
-    truth_data=None,
-    prediction_data=None,
     task_type=None,
     metrics=None,
     data_mapping=None,
@@ -248,14 +238,8 @@
     test_data = data
     _data_is_file = False
 
-    if "y_pred" in data_mapping:
-        prediction_data = data_mapping.get("y_pred")
-
-    if "y_test" in data_mapping:
-        truth_data = data_mapping.get("y_test")
-
-    if target is None and prediction_data is None:
-        raise Exception("target and prediction data cannot be null")
+    if "answer" in data_mapping:
+        prediction_data = data_mapping.get("answer")
 
     if task_type not in SUPPORTED_TASK_TYPE:
         raise Exception(f"task type {task_type} is not supported")
@@ -281,8 +265,6 @@
 
     asset_handler = asset_handler_class(
         asset=target,
-        prediction_data=prediction_data,
-        ground_truth=truth_data,
         test_data=test_data,
         metrics_config=metrics_config,
         **kwargs
@@ -299,8 +281,6 @@
     custom_prompt_metrics = [metric for metric in metrics if isinstance(metric, PromptMetric)]
     code_metrics = [metric for metric in metrics if isinstance(metric, CodeMetric)]
 
-    # TODO : Once PF is used for inbuilt metrics parallelize submission of metrics calculation of different kind
-
     if custom_prompt_metrics:
        for metric in custom_prompt_metrics:
            metrics_config.setdefault(metric.name, {param: param for param in metric.parameters})
@@ -309,12 +289,8 @@
            task_type="custom-prompt-metric",
            metrics=custom_prompt_metrics,
            prediction_data=asset_handler.prediction_data,
-            truth_data=asset_handler.ground_truth,
            test_data=asset_handler.test_data,
            metrics_mapping=metrics_config,
-            prediction_data_column_name=prediction_data if isinstance(prediction_data, str) else None,
-            ground_truth_column_name=truth_data if isinstance(truth_data, str) else None,
-            type_to_kwargs="custom-prompt-metric"
        )
 
        prompt_metric_results = prompt_metric_handler.calculate_metrics()
@@ -328,12 +304,8 @@
            task_type="custom-code-metric",
            metrics=code_metrics,
            prediction_data=asset_handler.prediction_data,
-            truth_data=asset_handler.ground_truth,
            test_data=asset_handler.test_data,
            metrics_mapping=metrics_config,
-            prediction_data_column_name=prediction_data if isinstance(prediction_data, str) else None,
-            ground_truth_column_name=truth_data if isinstance(truth_data, str) else None,
-            type_to_kwargs="code-prompt-metric"
        )
 
        code_metric_results = code_metric_handler.calculate_metrics()
@@ -347,12 +319,10 @@
            task_type=SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING[task_type],
            metrics=inbuilt_metrics,
            prediction_data=asset_handler.prediction_data,
-            truth_data=asset_handler.ground_truth,
+            input_output_data=asset_handler.input_output_data,
            test_data=asset_handler.test_data,
            metrics_mapping=metrics_config,
-            prediction_data_column_name=prediction_data if isinstance(prediction_data, str) else None,
-            ground_truth_column_name=truth_data if isinstance(truth_data, str) else None,
-            type_to_kwargs=TYPE_TO_KWARGS_MAPPING[task_type]
+            data_mapping=data_mapping,
        )
 
        inbuilt_metrics_results = inbuilt_metrics_handler.calculate_metrics()
@@ -393,6 +363,7 @@
        eval_artifact_df = _get_instance_table(metrics_results, task_type, asset_handler).to_json(orient="records",
                                                                                                  lines=True,
                                                                                                  force_ascii=False)
+        # eval_artifact_df = result.to_json(orient="records", lines=True, force_ascii=False)
        tmp_path = os.path.join(tmpdir, "eval_results.jsonl")
 
        with open(tmp_path, "w", encoding="utf-8") as f:
@@ -480,22 +451,14 @@ def _get_chat_instance_table(metrics):
 
 
 def _get_instance_table(metrics, task_type, asset_handler):
-    if metrics.get("artifacts"):
-        metrics.get("artifacts").pop("bertscore", None)
+
     if task_type == CHAT:
        instance_level_metrics_table = _get_chat_instance_table(metrics.get("artifacts"))
    else:
        instance_level_metrics_table = pd.DataFrame(metrics.get("artifacts"))
 
-    prediction_data = asset_handler.prediction_data
-    for column in asset_handler.prediction_data.columns.values:
-        if column in asset_handler.test_data.columns.values:
-            prediction_data.drop(column, axis=1, inplace=True)
-
    combined_table = pd.concat(
-        [asset_handler.test_data,
-         prediction_data,
-         asset_handler.ground_truth,
+        [asset_handler.input_output_data,
         instance_level_metrics_table
        ],
        axis=1,
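
The deprecation shim added to evaluate() can be read in isolation as the sketch below (the helper name _migrate_data_mapping is illustrative, not SDK API): callers should pass "answer" and "ground_truth" keys in data_mapping, while the old y_pred / y_test keys are rewritten with a warning.

import warnings
from typing import Dict


def _migrate_data_mapping(data_mapping: Dict[str, str]) -> Dict[str, str]:
    # Mirrors the shim in evaluate(): rewrite deprecated keys to the new names.
    new_mapping = dict(data_mapping)
    if "y_pred" in new_mapping:
        warnings.warn('y_pred is deprecated, please use "answer" instead')
        new_mapping["answer"] = new_mapping.pop("y_pred")
    if "y_test" in new_mapping:
        warnings.warn('y_test is deprecated, please use "ground_truth" instead')
        new_mapping["ground_truth"] = new_mapping.pop("y_test")
    return new_mapping


print(_migrate_data_mapping({"y_pred": "response", "y_test": "reference"}))
# {'answer': 'response', 'ground_truth': 'reference'}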

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_local_code_handler.py

Lines changed: 17 additions & 9 deletions
@@ -3,8 +3,10 @@
 # ---------------------------------------------------------
 
 import logging
+import pandas as pd
 
 from azure.ai.generative.evaluate._base_handler import BaseHandler
+from ._utils import df_to_dict_list
 
 logger = logging.getLogger(__name__)
 
@@ -22,21 +24,27 @@ def __init__(self, asset, test_data, prediction_data=None, ground_truth=None, **
             **kwargs
         )
 
-    def generate_prediction_data(self):
-        # TODO: Check if this is the right place for this logic
+    def execute_target(self):
         prediction_data = []
-        test_data = self.get_test_data_as_jsonl()
+        input_output_data = []
+        test_data = df_to_dict_list(self.test_data, self.params_dict)
 
         import inspect
         is_asset_async = False
         if inspect.iscoroutinefunction(self.asset):
             is_asset_async = True
             import asyncio
 
-        for d in test_data:
-            prediction_data.append(
-                asyncio.run(self.asset(**d)) if is_asset_async else self.asset(**d)
-            )
+        for input in test_data:
+            # The assumption here is target function returns a dict with output keys
+            fn_output = asyncio.run(self.asset(**input)) if is_asset_async else self.asset(**input)
+
+            prediction_data.append(fn_output)
+            # When input and output have a common key, value in output overrides value in input
+            input_output = dict(input)
+            input_output.update(fn_output)
+            input_output_data.append(input_output)
 
-
-        return prediction_data
+
+        self._prediction_data = pd.DataFrame(prediction_data)
+        self._input_output_data = pd.DataFrame(input_output_data)
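
A small usage sketch of the assumption the new loop relies on: a local target callable returns a dict of output keys, and each input row is merged with that dict to form one input/output record, with output values winning on key collisions. The my_target function and its fields are invented for illustration.

def my_target(question: str, context: str = "") -> dict:
    # Hypothetical local target: returns a dict of output keys.
    return {"answer": f"echo: {question}", "context": "generated context"}


row = {"question": "What is PF?", "context": "docs"}
output = my_target(**row)

merged = dict(row)
merged.update(output)  # the output "context" overrides the input one
print(merged)
# {'question': 'What is PF?', 'context': 'generated context', 'answer': 'echo: What is PF?'}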
