
Commit 2baa02a

Nest column_mapping in evaluator_config (Azure#37551)
* evaluator config
* add a change log entry
* fix changelog wording
* fix another thing in changelog
* fix typing issue
* add to __init__
* fix some tests
* fix a spell check issue
* fix tests again
* fix last failing test
* clean up changelog
1 parent c92ee71 commit 2baa02a

6 files changed (+114, -53 lines)

sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 33 additions & 0 deletions
@@ -8,6 +8,39 @@
 
 ### Breaking Changes
 
+- The `evaluator_config` parameter of `evaluate` now maps each evaluator name to a dictionary `EvaluatorConfig`, which is a `TypedDict`. The
+`column_mapping` between `data` or `target` and evaluator field names should now be specified inside this new dictionary:
+
+Before:
+```python
+evaluate(
+    ...,
+    evaluator_config={
+        "hate_unfairness": {
+            "query": "${data.question}",
+            "response": "${data.answer}",
+        }
+    },
+    ...
+)
+```
+
+After:
+```python
+evaluate(
+    ...,
+    evaluator_config={
+        "hate_unfairness": {
+            "column_mapping": {
+                "query": "${data.question}",
+                "response": "${data.answer}",
+            }
+        }
+    },
+    ...
+)
+```
+
 ### Bugs Fixed
 
 ### Other Changes
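For callers on typed code paths, the same nested shape can be expressed with the newly exported `EvaluatorConfig` TypedDict. A minimal sketch, assuming a package build that includes this commit (the evaluator name and data columns below are placeholders, not part of the change itself):

```python
from typing import Dict

from azure.ai.evaluation import EvaluatorConfig

# Nested form introduced by this change: evaluator name -> EvaluatorConfig -> column_mapping.
evaluator_config: Dict[str, EvaluatorConfig] = {
    "hate_unfairness": EvaluatorConfig(
        column_mapping={
            "query": "${data.question}",
            "response": "${data.answer}",
        }
    ),
}
```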

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py

Lines changed: 7 additions & 1 deletion
@@ -25,7 +25,12 @@
 from ._evaluators._rouge import RougeScoreEvaluator, RougeType
 from ._evaluators._similarity import SimilarityEvaluator
 from ._evaluators._xpia import IndirectAttackEvaluator
-from ._model_configurations import AzureAIProject, AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from ._model_configurations import (
+    AzureAIProject,
+    AzureOpenAIModelConfiguration,
+    OpenAIModelConfiguration,
+    EvaluatorConfig,
+)
 
 __all__ = [
     "evaluate",
@@ -53,4 +58,5 @@
     "AzureAIProject",
     "AzureOpenAIModelConfiguration",
     "OpenAIModelConfiguration",
+    "EvaluatorConfig",
 ]

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 47 additions & 38 deletions
@@ -19,7 +19,7 @@
     Prefixes,
     _InternalEvaluationMetrics,
 )
-from .._model_configurations import AzureAIProject
+from .._model_configurations import AzureAIProject, EvaluatorConfig
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
 from ._utils import (
@@ -273,7 +273,7 @@ def _validate_columns(
     df: pd.DataFrame,
     evaluators: Dict[str, Any],
     target: Optional[Callable],
-    evaluator_config: Dict[str, Dict[str, str]],
+    column_mapping: Dict[str, Dict[str, str]],
 ) -> None:
     """
     Check that all columns needed by evaluator or target function are present.
@@ -284,8 +284,8 @@ def _validate_columns(
     :type evaluators: Dict[str, Any]
     :param target: The callable to be applied to data set.
     :type target: Optional[Callable]
-    :param evaluator_config: The configuration for evaluators.
-    :type evaluator_config: Dict[str, Dict[str, str]]
+    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
+    :type column_mapping: Dict[str, Dict[str, str]]
     :raises EvaluationException: If column starts from "__outputs." while target is defined.
     """
     if target:
@@ -306,7 +306,7 @@ def _validate_columns(
     else:
         for evaluator_name, evaluator in evaluators.items():
             # Apply column mapping
-            mapping_config = evaluator_config.get(evaluator_name, evaluator_config.get("default", None))
+            mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
             new_df = _apply_column_mapping(df, mapping_config)
 
             # Validate input data for evaluator
@@ -372,11 +372,11 @@ def _apply_target_to_data(
     return target_output, generated_columns, run
 
 
-def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
-    """Process evaluator_config to replace ${target.} with ${data.}
+def _process_column_mappings(column_mapping: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
+    """Process column_mapping to replace ${target.} with ${data.}
 
-    :param evaluator_config: The configuration for evaluators.
-    :type evaluator_config: Dict[str, Dict[str, str]]
+    :param column_mapping: The configuration for evaluators.
+    :type column_mapping: Dict[str, Dict[str, str]]
     :return: The processed configuration.
     :rtype: Dict[str, Dict[str, str]]
     """
@@ -385,15 +385,15 @@ def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Di
 
     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
 
-    if evaluator_config:
-        for evaluator, mapping_config in evaluator_config.items():
+    if column_mapping:
+        for evaluator, mapping_config in column_mapping.items():
             if isinstance(mapping_config, dict):
                 processed_config[evaluator] = {}
 
                 for map_to_key, map_value in mapping_config.items():
                     # Check if there's any unexpected reference other than ${target.} or ${data.}
                     if unexpected_references.search(map_value):
-                        msg = "Unexpected references detected in 'evaluator_config'. Ensure only ${target.} and ${data.} are used."
+                        msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
                         raise EvaluationException(
                             message=msg,
                             internal_message=msg,
@@ -439,7 +439,7 @@ def evaluate(
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,
@@ -458,10 +458,10 @@ def evaluate(
     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
     :paramtype target: Optional[Callable]
     :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
-        names as keys and a dictionary of column mappings as values. The column mappings should be a dictionary with
-        keys as the column names in the evaluator input and values as the column names in the input data or data
-        generated by target.
-    :paramtype evaluator_config: Optional[Dict[str, Dict[str, str]]
+        names as keys and values that are dictionaries containing the column mappings. The column mappings should
+        be a dictionary with keys as the column names in the evaluator input and values as the column names in the
+        input data or data generated by target.
+    :paramtype evaluator_config: Optional[Dict[str, ~azure.ai.evaluation.EvaluatorConfig]]
     :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
         the results will be saved to a file named `evaluation_results.json` in the folder.
     :paramtype output_path: Optional[str]
@@ -482,7 +482,7 @@ def evaluate(
         model_config = {
             "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
             "api_key": os.environ.get("AZURE_OPENAI_KEY"),
-            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT")
+            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
         }
 
         coherence_eval = CoherenceEvaluator(model_config=model_config)
@@ -497,15 +497,19 @@ def evaluate(
             },
             evaluator_config={
                 "coherence": {
-                    "response": "${data.response}",
-                    "query": "${data.query}"
+                    "column_mapping": {
+                        "response": "${data.response}",
+                        "query": "${data.query}",
+                    },
                 },
                 "relevance": {
-                    "response": "${data.response}",
-                    "context": "${data.context}",
-                    "query": "${data.query}"
-                }
-            }
+                    "column_mapping": {
+                        "response": "${data.response}",
+                        "context": "${data.context}",
+                        "query": "${data.query}",
+                    },
+                },
+            },
         )
 
         """
@@ -544,13 +548,13 @@ def evaluate(
         raise e
 
 
-def _evaluate(  # pylint: disable=too-many-locals
+def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     data: Optional[str] = None,
     evaluators: Optional[Dict[str, Callable]] = None,
-    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,
@@ -560,8 +564,13 @@ def _evaluate( # pylint: disable=too-many-locals
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
-    evaluator_config = _process_evaluator_config(evaluator_config)
-    _validate_columns(input_data_df, evaluators, target, evaluator_config)
+    # extract column mapping dicts into dictionary mapping evaluator name to column mapping
+    column_mapping = {
+        evaluator_name: evaluator_configuration.get("column_mapping", None)
+        for evaluator_name, evaluator_configuration in evaluator_config.items()
+    }
+    column_mapping = _process_column_mappings(column_mapping)
+    _validate_columns(input_data_df, evaluators, target, column_mapping)
 
     # Target Run
     pf_client = PFClient(
@@ -577,30 +586,30 @@ def _evaluate( # pylint: disable=too-many-locals
 
     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
-    evaluator_config = evaluator_config or {}
-    evaluator_config.setdefault("default", {})
+    column_mapping = column_mapping or {}
+    column_mapping.setdefault("default", {})
 
     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
             target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
         )
 
-        for evaluator_name, mapping in evaluator_config.items():
+        for evaluator_name, mapping in column_mapping.items():
             mapped_to_values = set(mapping.values())
             for col in target_generated_columns:
                 # If user defined mapping differently, do not change it.
                 # If it was mapped to target, we have already changed it
-                # in _process_evaluator_config
+                # in _process_column_mappings
                 run_output = f"${{run.outputs.{col}}}"
                 # We will add our mapping only if
                 # customer did not mapped target output.
                 if col not in mapping and run_output not in mapped_to_values:
-                    evaluator_config[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
+                    column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
 
         # After we have generated all columns we can check if we have
         # everything we need for evaluators.
-        _validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)
+        _validate_columns(input_data_df, evaluators, target=None, column_mapping=column_mapping)
 
     # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
     # via target mapping.
@@ -610,8 +619,8 @@ def _evaluate( # pylint: disable=too-many-locals
     for col in input_data_df.columns:
         # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
         # Also ignore columns that are already in config, since they've been covered by target mapping.
-        if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in evaluator_config["default"].keys():
-            evaluator_config["default"][col] = f"${{data.{col}}}"
+        if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
+            column_mapping["default"][col] = f"${{data.{col}}}"
     # Batch Run
     evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
@@ -632,7 +641,7 @@ def _evaluate( # pylint: disable=too-many-locals
                 flow=evaluator,
                 run=target_run,
                 evaluator_name=evaluator_name,
-                column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
+                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
                 data=data,
                 stream=True,
                 name=kwargs.get("_run_name"),
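The behavioral core of the `_evaluate` change is the extraction step that pulls each nested `column_mapping` out of `evaluator_config` before validation and default-mapping. Below is a standalone sketch of that pattern; `extract_column_mappings` is a hypothetical helper name introduced here for illustration, whereas the diff inlines the same dict comprehension:

```python
from typing import Dict, Optional


def extract_column_mappings(
    evaluator_config: Dict[str, dict],
) -> Dict[str, Optional[Dict[str, str]]]:
    # Mirror of the new inline dict comprehension in _evaluate: keep only the
    # "column_mapping" entry of each EvaluatorConfig, or None when it is absent.
    return {
        evaluator_name: evaluator_configuration.get("column_mapping", None)
        for evaluator_name, evaluator_configuration in evaluator_config.items()
    }


# The previously flat mapping now has to be nested one level deeper:
nested = {"coherence": {"column_mapping": {"query": "${data.query}"}}}
assert extract_column_mappings(nested) == {"coherence": {"query": "${data.query}"}}
```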

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py

Lines changed: 8 additions & 1 deletion
@@ -2,7 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from typing import Literal, TypedDict
+from typing import Dict, Literal, TypedDict
 
 from typing_extensions import NotRequired
 
@@ -46,3 +46,10 @@ class AzureAIProject(TypedDict):
     """Azure resource group name of the project"""
     project_name: str
     """Azure project name"""
+
+
+class EvaluatorConfig(TypedDict, total=False):
+    """Configuration for an evaluator"""
+
+    column_mapping: Dict[str, str]
+    """Dictionary mapping evaluator input name to column in data"""

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py

Lines changed: 17 additions & 11 deletions
@@ -292,12 +292,12 @@ def test_evaluate_with_target(self, questions_file):
             None,
             {"default": {}},
             {"default": {}, "question_ev": {}},
-            {"default": {"query": "${target.query}"}},
-            {"default": {"query": "${data.query}"}},
-            {"default": {}, "question_ev": {"query": "${data.query}"}},
-            {"default": {}, "question_ev": {"query": "${target.query}"}},
-            {"default": {}, "question_ev": {"another_question": "${target.query}"}},
-            {"default": {"another_question": "${target.query}"}},
+            {"default": {"column_mapping": {"query": "${target.query}"}}},
+            {"default": {"column_mapping": {"query": "${data.query}"}}},
+            {"default": {}, "question_ev": {"column_mapping": {"query": "${data.query}"}}},
+            {"default": {}, "question_ev": {"column_mapping": {"query": "${target.query}"}}},
+            {"default": {}, "question_ev": {"column_mapping": {"another_question": "${target.query}"}}},
+            {"default": {"column_mapping": {"another_question": "${target.query}"}}},
         ],
     )
     def test_evaluate_another_questions(self, questions_file, evaluation_config):
@@ -334,19 +334,25 @@ def test_evaluate_another_questions(self, questions_file, evaluation_config):
             (
                 {
                     "f1_score": {
-                        "response": "${data.context}",
-                        "ground_truth": "${data.ground_truth}",
+                        "column_mapping": {
+                            "response": "${data.context}",
+                            "ground_truth": "${data.ground_truth}",
+                        }
                     },
                     "answer": {
-                        "response": "${target.response}",
+                        "column_mapping": {
+                            "response": "${target.response}",
+                        }
                     },
                 }
             ),
             (
                 {
                     "default": {
-                        "response": "${target.response}",
-                        "ground_truth": "${data.ground_truth}",
+                        "column_mapping": {
+                            "response": "${target.response}",
+                            "ground_truth": "${data.ground_truth}",
+                        }
                     },
                 }
             ),

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py

Lines changed: 2 additions & 2 deletions
@@ -330,11 +330,11 @@ def test_evaluate_invalid_evaluator_config(self, mock_model_config, evaluate_tes
             evaluate(
                 data=evaluate_test_data_jsonl_file,
                 evaluators={"g": GroundednessEvaluator(model_config=mock_model_config)},
-                evaluator_config={"g": {"query": "${foo.query}"}},
+                evaluator_config={"g": {"column_mapping": {"query": "${foo.query}"}}},
             )
 
         assert (
-            "Unexpected references detected in 'evaluator_config'. Ensure only ${target.} and ${data.} are used."
+            "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
             in exc_info.value.args[0]
         )