
Commit 2baa02a

Nest column_mapping in evaluator_config (Azure#37551)
* evaluator config
* add a change log entry
* fix changelog wording
* fix another thing in changelog
* fix typing issue
* add to __init__
* fix some tests
* fix a spell check issue
* fix tests again
* fix last failing test
* clean up changelog
1 parent c92ee71 commit 2baa02a

6 files changed (+114, -53 lines)

sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 33 additions & 0 deletions
@@ -8,6 +8,39 @@
 
 ### Breaking Changes
 
+- The `evaluator_config` parameter of `evaluate` now maps each evaluator name to a dictionary `EvaluatorConfig`, which is a `TypedDict`. The
+`column_mapping` between `data` or `target` and evaluator field names should now be specified inside this new dictionary:
+
+Before:
+```python
+evaluate(
+    ...,
+    evaluator_config={
+        "hate_unfairness": {
+            "query": "${data.question}",
+            "response": "${data.answer}",
+        }
+    },
+    ...
+)
+```
+
+After:
+```python
+evaluate(
+    ...,
+    evaluator_config={
+        "hate_unfairness": {
+            "column_mapping": {
+                "query": "${data.question}",
+                "response": "${data.answer}",
+            }
+        }
+    },
+    ...
+)
+```
+
 ### Bugs Fixed
 
 ### Other Changes
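For callers on typed code paths, the same nested shape can be expressed with the newly exported `EvaluatorConfig` TypedDict. A minimal sketch, assuming a package build that includes this commit (the evaluator name and data columns below are placeholders, not part of the change itself):

```python
from typing import Dict

from azure.ai.evaluation import EvaluatorConfig

# Nested form introduced by this change: evaluator name -> EvaluatorConfig -> column_mapping.
evaluator_config: Dict[str, EvaluatorConfig] = {
    "hate_unfairness": EvaluatorConfig(
        column_mapping={
            "query": "${data.question}",
            "response": "${data.answer}",
        }
    ),
}
```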

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py

Lines changed: 7 additions & 1 deletion
@@ -25,7 +25,12 @@
 from ._evaluators._rouge import RougeScoreEvaluator, RougeType
 from ._evaluators._similarity import SimilarityEvaluator
 from ._evaluators._xpia import IndirectAttackEvaluator
-from ._model_configurations import AzureAIProject, AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from ._model_configurations import (
+    AzureAIProject,
+    AzureOpenAIModelConfiguration,
+    OpenAIModelConfiguration,
+    EvaluatorConfig,
+)
 
 __all__ = [
     "evaluate",
@@ -53,4 +58,5 @@
     "AzureAIProject",
     "AzureOpenAIModelConfiguration",
     "OpenAIModelConfiguration",
+    "EvaluatorConfig",
 ]

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 47 additions & 38 deletions
@@ -19,7 +19,7 @@
     Prefixes,
     _InternalEvaluationMetrics,
 )
-from .._model_configurations import AzureAIProject
+from .._model_configurations import AzureAIProject, EvaluatorConfig
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
 from ._utils import (
@@ -273,7 +273,7 @@ def _validate_columns(
     df: pd.DataFrame,
     evaluators: Dict[str, Any],
     target: Optional[Callable],
-    evaluator_config: Dict[str, Dict[str, str]],
+    column_mapping: Dict[str, Dict[str, str]],
 ) -> None:
     """
     Check that all columns needed by evaluator or target function are present.
@@ -284,8 +284,8 @@ def _validate_columns(
     :type evaluators: Dict[str, Any]
     :param target: The callable to be applied to data set.
     :type target: Optional[Callable]
-    :param evaluator_config: The configuration for evaluators.
-    :type evaluator_config: Dict[str, Dict[str, str]]
+    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
+    :type column_mapping: Dict[str, Dict[str, str]]
     :raises EvaluationException: If column starts from "__outputs." while target is defined.
     """
     if target:
@@ -306,7 +306,7 @@ def _validate_columns(
     else:
         for evaluator_name, evaluator in evaluators.items():
             # Apply column mapping
-            mapping_config = evaluator_config.get(evaluator_name, evaluator_config.get("default", None))
+            mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
             new_df = _apply_column_mapping(df, mapping_config)
 
             # Validate input data for evaluator
@@ -372,11 +372,11 @@ def _apply_target_to_data(
     return target_output, generated_columns, run
 
 
-def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
-    """Process evaluator_config to replace ${target.} with ${data.}
+def _process_column_mappings(column_mapping: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
+    """Process column_mapping to replace ${target.} with ${data.}
 
-    :param evaluator_config: The configuration for evaluators.
-    :type evaluator_config: Dict[str, Dict[str, str]]
+    :param column_mapping: The configuration for evaluators.
+    :type column_mapping: Dict[str, Dict[str, str]]
     :return: The processed configuration.
     :rtype: Dict[str, Dict[str, str]]
     """
@@ -385,15 +385,15 @@ def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Di
 
     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
 
-    if evaluator_config:
-        for evaluator, mapping_config in evaluator_config.items():
+    if column_mapping:
+        for evaluator, mapping_config in column_mapping.items():
             if isinstance(mapping_config, dict):
                 processed_config[evaluator] = {}
 
                 for map_to_key, map_value in mapping_config.items():
                     # Check if there's any unexpected reference other than ${target.} or ${data.}
                     if unexpected_references.search(map_value):
-                        msg = "Unexpected references detected in 'evaluator_config'. Ensure only ${target.} and ${data.} are used."
+                        msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
                         raise EvaluationException(
                             message=msg,
                             internal_message=msg,
@@ -439,7 +439,7 @@ def evaluate(
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,
@@ -458,10 +458,10 @@ def evaluate(
     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
     :paramtype target: Optional[Callable]
     :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
-        names as keys and a dictionary of column mappings as values. The column mappings should be a dictionary with
-        keys as the column names in the evaluator input and values as the column names in the input data or data
-        generated by target.
-    :paramtype evaluator_config: Optional[Dict[str, Dict[str, str]]
+        names as keys and values that are dictionaries containing the column mappings. The column mappings should
+        be a dictionary with keys as the column names in the evaluator input and values as the column names in the
+        input data or data generated by target.
+    :paramtype evaluator_config: Optional[Dict[str, ~azure.ai.evaluation.EvaluatorConfig]]
     :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
         the results will be saved to a file named `evaluation_results.json` in the folder.
     :paramtype output_path: Optional[str]
@@ -482,7 +482,7 @@ def evaluate(
         model_config = {
             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
             "api_key": os.environ.get("AZURE_OPENAI_KEY"),
-            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT")
+            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }
 
         coherence_eval = CoherenceEvaluator(model_config=model_config)
@@ -497,15 +497,19 @@ def evaluate(
             },
             evaluator_config={
                 "coherence": {
-                    "response": "${data.response}",
-                    "query": "${data.query}"
+                    "column_mapping": {
+                        "response": "${data.response}",
+                        "query": "${data.query}",
+                    },
                 },
                 "relevance": {
-                    "response": "${data.response}",
-                    "context": "${data.context}",
-                    "query": "${data.query}"
-                }
-            }
+                    "column_mapping": {
+                        "response": "${data.response}",
+                        "context": "${data.context}",
+                        "query": "${data.query}",
+                    },
+                },
+            },
         )
 
         """
@@ -544,13 +548,13 @@ def evaluate(
         raise e
 
 
-def _evaluate(  # pylint: disable=too-many-locals
+def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     data: Optional[str] = None,
     evaluators: Optional[Dict[str, Callable]] = None,
-    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,
@@ -560,8 +564,13 @@ def _evaluate( # pylint: disable=too-many-locals
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
-    evaluator_config = _process_evaluator_config(evaluator_config)
-    _validate_columns(input_data_df, evaluators, target, evaluator_config)
+    # extract column mapping dicts into dictionary mapping evaluator name to column mapping
+    column_mapping = {
+        evaluator_name: evaluator_configuration.get("column_mapping", None)
+        for evaluator_name, evaluator_configuration in evaluator_config.items()
+    }
+    column_mapping = _process_column_mappings(column_mapping)
+    _validate_columns(input_data_df, evaluators, target, column_mapping)
 
     # Target Run
     pf_client = PFClient(
@@ -577,30 +586,30 @@ def _evaluate( # pylint: disable=too-many-locals
 
     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
-    evaluator_config = evaluator_config or {}
-    evaluator_config.setdefault("default", {})
+    column_mapping = column_mapping or {}
+    column_mapping.setdefault("default", {})
 
     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
         )
 
-        for evaluator_name, mapping in evaluator_config.items():
+        for evaluator_name, mapping in column_mapping.items():
             mapped_to_values = set(mapping.values())
             for col in target_generated_columns:
                 # If user defined mapping differently, do not change it.
                 # If it was mapped to target, we have already changed it
-                # in _process_evaluator_config
+                # in _process_column_mappings
                 run_output = f"${{run.outputs.{col}}}"
                 # We will add our mapping only if
                 # customer did not mapped target output.
                 if col not in mapping and run_output not in mapped_to_values:
-                    evaluator_config[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
+                    column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
 
         # After we have generated all columns we can check if we have
         # everything we need for evaluators.
-        _validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)
+        _validate_columns(input_data_df, evaluators, target=None, column_mapping=column_mapping)
 
     # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
     # via target mapping.
@@ -610,8 +619,8 @@ def _evaluate( # pylint: disable=too-many-locals
     for col in input_data_df.columns:
         # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
         # Also ignore columns that are already in config, since they've been covered by target mapping.
-        if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in evaluator_config["default"].keys():
-            evaluator_config["default"][col] = f"${{data.{col}}}"
+        if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
+            column_mapping["default"][col] = f"${{data.{col}}}"
     # Batch Run
     evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
@@ -632,7 +641,7 @@ def _evaluate( # pylint: disable=too-many-locals
                 flow=evaluator,
                 run=target_run,
                 evaluator_name=evaluator_name,
-                column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
+                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
                 data=data,
                 stream=True,
                 name=kwargs.get("_run_name"),
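The behavioral core of the `_evaluate` change is the extraction step that pulls each nested `column_mapping` out of `evaluator_config` before validation and default-mapping. Below is a standalone sketch of that pattern; `extract_column_mappings` is a hypothetical helper name introduced here for illustration, whereas the diff inlines the same dict comprehension:

```python
from typing import Dict, Optional


def extract_column_mappings(
    evaluator_config: Dict[str, dict],
) -> Dict[str, Optional[Dict[str, str]]]:
    # Mirror of the new inline dict comprehension in _evaluate: keep only the
    # "column_mapping" entry of each EvaluatorConfig, or None when it is absent.
    return {
        evaluator_name: evaluator_configuration.get("column_mapping", None)
        for evaluator_name, evaluator_configuration in evaluator_config.items()
    }


# The previously flat mapping now has to be nested one level deeper:
nested = {"coherence": {"column_mapping": {"query": "${data.query}"}}}
assert extract_column_mappings(nested) == {"coherence": {"query": "${data.query}"}}
```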

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py

Lines changed: 8 additions & 1 deletion
@@ -2,7 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from typing import Literal, TypedDict
+from typing import Dict, Literal, TypedDict
 
 from typing_extensions import NotRequired
 
@@ -46,3 +46,10 @@ class AzureAIProject(TypedDict):
     """Azure resource group name of the project"""
     project_name: str
     """Azure project name"""
+
+
+class EvaluatorConfig(TypedDict, total=False):
+    """Configuration for an evaluator"""
+
+    column_mapping: Dict[str, str]
+    """Dictionary mapping evaluator input name to column in data"""

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py

Lines changed: 17 additions & 11 deletions
@@ -292,12 +292,12 @@ def test_evaluate_with_target(self, questions_file):
             None,
             {"default": {}},
             {"default": {}, "question_ev": {}},
-            {"default": {"query": "${target.query}"}},
-            {"default": {"query": "${data.query}"}},
-            {"default": {}, "question_ev": {"query": "${data.query}"}},
-            {"default": {}, "question_ev": {"query": "${target.query}"}},
-            {"default": {}, "question_ev": {"another_question": "${target.query}"}},
-            {"default": {"another_question": "${target.query}"}},
+            {"default": {"column_mapping": {"query": "${target.query}"}}},
+            {"default": {"column_mapping": {"query": "${data.query}"}}},
+            {"default": {}, "question_ev": {"column_mapping": {"query": "${data.query}"}}},
+            {"default": {}, "question_ev": {"column_mapping": {"query": "${target.query}"}}},
+            {"default": {}, "question_ev": {"column_mapping": {"another_question": "${target.query}"}}},
+            {"default": {"column_mapping": {"another_question": "${target.query}"}}},
         ],
     )
     def test_evaluate_another_questions(self, questions_file, evaluation_config):
@@ -334,19 +334,25 @@ def test_evaluate_another_questions(self, questions_file, evaluation_config):
             (
                 {
                     "f1_score": {
-                        "response": "${data.context}",
-                        "ground_truth": "${data.ground_truth}",
+                        "column_mapping": {
+                            "response": "${data.context}",
+                            "ground_truth": "${data.ground_truth}",
+                        }
                     },
                     "answer": {
-                        "response": "${target.response}",
+                        "column_mapping": {
+                            "response": "${target.response}",
+                        }
                     },
                 }
             ),
             (
                 {
                     "default": {
-                        "response": "${target.response}",
-                        "ground_truth": "${data.ground_truth}",
+                        "column_mapping": {
+                            "response": "${target.response}",
+                            "ground_truth": "${data.ground_truth}",
+                        }
                     },
                 }
             ),

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py

Lines changed: 2 additions & 2 deletions
@@ -330,11 +330,11 @@ def test_evaluate_invalid_evaluator_config(self, mock_model_config, evaluate_tes
             evaluate(
                 data=evaluate_test_data_jsonl_file,
                 evaluators={"g": GroundednessEvaluator(model_config=mock_model_config)},
-                evaluator_config={"g": {"query": "${foo.query}"}},
+                evaluator_config={"g": {"column_mapping": {"query": "${foo.query}"}}},
             )
 
         assert (
-            "Unexpected references detected in 'evaluator_config'. Ensure only ${target.} and ${data.} are used."
+            "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
             in exc_info.value.args[0]
         )