Commit f1b0ea1

Improve e2e evaluator testing. (Azure#38758)
* enable gleu
* add similarity to e2e testing and refactor to use base class
* analysis and unit test fixes
* run black
* re-enabled no PF QA and image multi
* run black
* re-record new tests
* undo image with targets
* run black
* undo not using pf client
* remove qa again
* run black
* lower PF performance reqs
* update recordings
1 parent f79382f commit f1b0ea1

File tree

6 files changed, +97 -129 lines changed

sdk/evaluation/azure-ai-evaluation/assets.json

Lines changed: 1 addition & 1 deletion
@@ -2,5 +2,5 @@
   "AssetsRepo": "Azure/azure-sdk-assets",
   "AssetsRepoPrefixPath": "python",
   "TagPrefix": "python/evaluation/azure-ai-evaluation",
-  "Tag": "python/evaluation/azure-ai-evaluation_5ad4de0f7c"
+  "Tag": "python/evaluation/azure-ai-evaluation_4f3f9f39dc"
 }

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py

Lines changed: 9 additions & 1 deletion
@@ -25,6 +25,7 @@ class DerivedEvalInput(TypedDict, total=False):
     query: Dict[str, Any]
     response: Dict[str, Any]
     context: str
+    ground_truth: str


 AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
@@ -158,6 +159,7 @@ def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
         include_context = "context" in self._singleton_inputs
         include_query = "query" in self._singleton_inputs
         include_response = "response" in self._singleton_inputs
+        include_ground_truth = "ground_truth" in self._singleton_inputs

         def converter(conversation: Dict) -> List[DerivedEvalInput]:
             messages = cast(List[Dict[str, Any]], conversation["messages"])
@@ -198,6 +200,8 @@ def converter(conversation: Dict) -> List[DerivedEvalInput]:
                     eval_input["response"] = response.get("content", "")
                 if include_context:
                     eval_input["context"] = str(context)
+                if include_ground_truth:
+                    eval_input["ground_truth"] = response.get("ground_truth", "")
                 eval_inputs.append(eval_input)
             return eval_inputs

@@ -402,7 +406,9 @@ def __init__(self, real_call):  # DO NOT ADD TYPEHINT PROMPT FLOW WILL SCREAM AT
     # are just not passed into this function instead of ending up in kwargs.
     # Since we want this to be relatively call-agnostic, we just account for every input that any children
     # are known to throw at this, mash them into kwargs, and then pass them into the real call.
-    async def __call__(self, *, query=None, response=None, context=None, conversation=None, **kwargs):
+    async def __call__(
+        self, *, query=None, response=None, context=None, conversation=None, ground_truth=None, **kwargs
+    ):
         if conversation is not None:
             kwargs["conversation"] = conversation
         if query is not None:
@@ -411,4 +417,6 @@ async def __call__(self, *, query=None, response=None, context=None, conversation=None, **kwargs):
             kwargs["response"] = response
         if context is not None:
             kwargs["context"] = context
+        if ground_truth is not None:
+            kwargs["ground_truth"] = ground_truth
         return await self._real_call(**kwargs)
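
The base-class changes above are what let conversation-mode evaluators pick up a per-turn ground truth: the converter now copies a ground_truth field off each assistant message into the derived eval input, and the async wrapper forwards a ground_truth keyword through to the real call. Below is a minimal, standalone sketch of that conversion step for illustration only; it is not the SDK's internal converter, and the conversation shape and helper name are assumptions.

# Illustrative sketch: a simplified stand-in for the internal conversation
# converter, showing how a per-turn "ground_truth" field on an assistant
# message is surfaced alongside query/response/context.
from typing import Any, Dict, List


def derive_eval_inputs(conversation: Dict[str, Any]) -> List[Dict[str, str]]:
    messages = conversation["messages"]
    eval_inputs = []
    # Pair each user turn with the assistant turn that follows it.
    for user_msg, assistant_msg in zip(messages[::2], messages[1::2]):
        eval_inputs.append(
            {
                "query": user_msg.get("content", ""),
                "response": assistant_msg.get("content", ""),
                "context": str(assistant_msg.get("context", "")),
                # Mirrors the new behavior above: ground_truth is read off the
                # assistant message and defaults to an empty string.
                "ground_truth": assistant_msg.get("ground_truth", ""),
            }
        )
    return eval_inputs


convo = {
    "messages": [
        {"role": "user", "content": "What is the capital of Japan?"},
        {"role": "assistant", "content": "Tokyo.", "ground_truth": "Tokyo"},
    ]
}
print(derive_eval_inputs(convo))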

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py

Lines changed: 41 additions & 81 deletions
@@ -2,83 +2,15 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

-import math
 import os
-import re
+from typing import Dict

-from promptflow._utils.async_utils import async_run_allowing_running_loop
-from promptflow.core import AsyncPrompty
+from typing_extensions import overload, override

-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase

-from ..._common.utils import construct_prompty_model_config, validate_model_config

-try:
-    from ..._user_agent import USER_AGENT
-except ImportError:
-    USER_AGENT = "None"
-
-
-class _AsyncSimilarityEvaluator:
-    # Constants must be defined within eval's directory to be save/loadable
-    _PROMPTY_FILE = "similarity.prompty"
-    _LLM_CALL_TIMEOUT = 600
-    _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-    def __init__(self, model_config: dict):
-        prompty_model_config = construct_prompty_model_config(
-            validate_model_config(model_config),
-            self._DEFAULT_OPEN_API_VERSION,
-            USER_AGENT,
-        )
-
-        current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
-
-    async def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
-        """
-        Evaluate similarity.
-
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :keyword ground_truth: The ground truth to be evaluated.
-        :paramtype ground_truth: str
-        :return: The similarity score.
-        :rtype: Dict[str, float]
-        """
-        # Validate input parameters
-        query = str(query or "")
-        response = str(response or "")
-        ground_truth = str(ground_truth or "")
-
-        if not (query.strip() and response.strip() and ground_truth.strip()):
-            msg = "'query', 'response' and 'ground_truth' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.SIMILARITY_EVALUATOR,
-            )
-
-        # Run the evaluation flow
-        llm_output = await self._flow(
-            query=query, response=response, ground_truth=ground_truth, timeout=self._LLM_CALL_TIMEOUT, **kwargs
-        )
-
-        score = math.nan
-        if llm_output:
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
-
-        return {"similarity": float(score), "gpt_similarity": float(score)}
-
-
-class SimilarityEvaluator:
+class SimilarityEvaluator(PromptyEvaluatorBase):
     """
     Evaluates similarity score for a given query, response, and ground truth.

@@ -113,13 +45,27 @@ class SimilarityEvaluator:
     however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
     """

-    id = "azureml://registries/azureml/models/Similarity-Evaluator/versions/3"
+    # Constants must be defined within eval's directory to be save/loadable
+
+    _PROMPTY_FILE = "similarity.prompty"
+    _RESULT_KEY = "similarity"
+
+    id = "similarity"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

+    @override
     def __init__(self, model_config):
-        self._async_evaluator = _AsyncSimilarityEvaluator(model_config)
-
-    def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+
+    # Ignoring a mypy error about having only 1 overload function.
+    # We want to use the overload style for all evals, even single-inputs. This is both to make
+    # refactoring to multi-input styles easier, stylistic consistency consistency across evals,
+    # and due to the fact that non-overloaded syntax now causes various parsing issues that
+    # we don't want to deal with.
+    @overload  # type: ignore
+    def __call__(self, *, query: str, response: str, ground_truth: str) -> Dict[str, float]:
         """
         Evaluate similarity.

@@ -132,9 +78,23 @@ def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
         :return: The similarity score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, query=query, response=response, ground_truth=ground_truth, **kwargs
-        )

-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate similarity.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The similarity score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
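
After this refactor, SimilarityEvaluator is a thin subclass of PromptyEvaluatorBase: the constructor simply points the base class at similarity.prompty, while the public keyword signature (query, response, ground_truth) and the similarity/gpt_similarity output keys stay the same. A minimal usage sketch, assuming an Azure OpenAI model configuration with placeholder endpoint, key, and deployment values:

from azure.ai.evaluation import AzureOpenAIModelConfiguration, SimilarityEvaluator

# Placeholder configuration values; substitute your own resource details.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<api-key>",
    azure_deployment="<deployment-name>",
)

similarity = SimilarityEvaluator(model_config)
result = similarity(
    query="What is the capital of Japan?",
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)
# Both the new key and the legacy gpt_-prefixed key are populated, as noted
# in the evaluator's docstring above.
print(result["similarity"], result["gpt_similarity"])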

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_mass_evaluate.py

Lines changed: 43 additions & 43 deletions
@@ -5,18 +5,19 @@
 import pathlib
 import pandas as pd
 import pytest
+from regex import F


 from azure.ai.evaluation import (
     F1ScoreEvaluator,
-    # GleuScoreEvaluator,
+    GleuScoreEvaluator,
     BleuScoreEvaluator,
     RougeScoreEvaluator,
     MeteorScoreEvaluator,
     CoherenceEvaluator,
     FluencyEvaluator,
     RelevanceEvaluator,
-    # SimilarityEvaluator,
+    SimilarityEvaluator,
     GroundednessEvaluator,
     # QAEvaluator,
     ContentSafetyEvaluator,
@@ -74,21 +75,20 @@ class TestMassEvaluate:
     """

     def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope, data_file):
-        # qa and similarity disabled due to being playback-unfriendly due to URL sanitization problems.
-        # glue disabled due to being unfriendly to CI playback for some reason.
-        # content safety disabled temporarily to test CI PF teardown race condition
+        # qa fails in playback but ONLY when using the pf proxy for some reason, and
+        # using it without pf proxy causes CI to hang and timeout after 3 hours.
         evaluators = {
             "f1_score": F1ScoreEvaluator(),
-            # "gleu": GleuScoreEvaluator(),
+            "gleu": GleuScoreEvaluator(),
             "bleu": BleuScoreEvaluator(),
             "rouge": RougeScoreEvaluator(RougeType.ROUGE_L),
             "meteor": MeteorScoreEvaluator(),
             "grounded": GroundednessEvaluator(model_config),
             "coherence": CoherenceEvaluator(model_config),
             "fluency": FluencyEvaluator(model_config),
             "relevance": RelevanceEvaluator(model_config),
-            # "similarity": SimilarityEvaluator(model_config),
-            # "qa" : QAEvaluator(model_config),
+            "similarity": SimilarityEvaluator(model_config),
+            # "qa": QAEvaluator(model_config),
             "grounded_pro": GroundednessProEvaluator(azure_cred, project_scope),
             "protected_material": ProtectedMaterialEvaluator(azure_cred, project_scope),
             "indirect_attack": IndirectAttackEvaluator(azure_cred, project_scope),
@@ -105,13 +105,13 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope, data_file):
         row_result_df = pd.DataFrame(result["rows"])
         metrics = result["metrics"]

-        assert len(row_result_df.keys()) == 45  # 63 with gleu, qa/similarity
+        assert len(row_result_df.keys()) == 48  # 63 with qa
         assert len(row_result_df["inputs.query"]) == 3
         assert len(row_result_df["inputs.context"]) == 3
         assert len(row_result_df["inputs.response"]) == 3
         assert len(row_result_df["inputs.ground_truth"]) == 3
         assert len(row_result_df["outputs.f1_score.f1_score"]) == 3
-        # assert len(row_result_df["outputs.gleu.gleu_score"]) == 3
+        assert len(row_result_df["outputs.gleu.gleu_score"]) == 3
         assert len(row_result_df["outputs.bleu.bleu_score"]) == 3
         assert len(row_result_df["outputs.rouge.rouge_precision"]) == 3
         assert len(row_result_df["outputs.rouge.rouge_recall"]) == 3
@@ -129,23 +129,8 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope, data_file):
         assert len(row_result_df["outputs.relevance.relevance"]) == 3
         assert len(row_result_df["outputs.relevance.gpt_relevance"]) == 3
         assert len(row_result_df["outputs.relevance.relevance_reason"]) == 3
-        # assert len(row_result_df['outputs.similarity.similarity']) == 3
-        # assert len(row_result_df['outputs.similarity.gpt_similarity']) == 3
-        # assert len(row_result_df['outputs.qa.f1_score']) == 3
-        # assert len(row_result_df['outputs.qa.groundedness']) == 3
-        # assert len(row_result_df['outputs.qa.gpt_groundedness']) == 3
-        # assert len(row_result_df['outputs.qa.groundedness_reason']) == 3
-        # assert len(row_result_df['outputs.qa.coherence']) == 3
-        # assert len(row_result_df['outputs.qa.gpt_coherence']) == 3
-        # assert len(row_result_df['outputs.qa.coherence_reason']) == 3
-        # assert len(row_result_df['outputs.qa.fluency']) == 3
-        # assert len(row_result_df['outputs.qa.gpt_fluency']) == 3
-        # assert len(row_result_df['outputs.qa.fluency_reason']) == 3
-        # assert len(row_result_df['outputs.qa.relevance']) == 3
-        # assert len(row_result_df['outputs.qa.gpt_relevance']) == 3
-        # assert len(row_result_df['outputs.qa.relevance_reason']) == 3
-        # assert len(row_result_df['outputs.qa.similarity']) == 3
-        # assert len(row_result_df['outputs.qa.gpt_similarity']) == 3
+        assert len(row_result_df["outputs.similarity.similarity"]) == 3
+        assert len(row_result_df["outputs.similarity.gpt_similarity"]) == 3
         assert len(row_result_df["outputs.grounded_pro.groundedness_pro_label"]) == 3
         assert len(row_result_df["outputs.grounded_pro.groundedness_pro_reason"]) == 3
         assert len(row_result_df["outputs.protected_material.protected_material_label"]) == 3
@@ -169,10 +154,25 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope, data_file):
         assert len(row_result_df["outputs.content_safety.violence"]) == 3
         assert len(row_result_df["outputs.content_safety.violence_score"]) == 3
         assert len(row_result_df["outputs.content_safety.violence_reason"]) == 3
+        # assert len(row_result_df["outputs.qa.f1_score"]) == 3
+        # assert len(row_result_df["outputs.qa.groundedness"]) == 3
+        # assert len(row_result_df["outputs.qa.gpt_groundedness"]) == 3
+        # assert len(row_result_df["outputs.qa.groundedness_reason"]) == 3
+        # assert len(row_result_df["outputs.qa.coherence"]) == 3
+        # assert len(row_result_df["outputs.qa.gpt_coherence"]) == 3
+        # assert len(row_result_df["outputs.qa.coherence_reason"]) == 3
+        # assert len(row_result_df["outputs.qa.fluency"]) == 3
+        # assert len(row_result_df["outputs.qa.gpt_fluency"]) == 3
+        # assert len(row_result_df["outputs.qa.fluency_reason"]) == 3
+        # assert len(row_result_df["outputs.qa.relevance"]) == 3
+        # assert len(row_result_df["outputs.qa.gpt_relevance"]) == 3
+        # assert len(row_result_df["outputs.qa.relevance_reason"]) == 3
+        # assert len(row_result_df["outputs.qa.similarity"]) == 3
+        # assert len(row_result_df["outputs.qa.gpt_similarity"]) == 3

-        assert len(metrics.keys()) == 25  # 39 with gleu, qa, similarity
+        assert len(metrics.keys()) == 28  # 39 with qa
         assert metrics["f1_score.f1_score"] >= 0
-        # assert metrics["gleu.gleu_score"] >= 0
+        assert metrics["gleu.gleu_score"] >= 0
         assert metrics["bleu.bleu_score"] >= 0
         assert metrics["rouge.rouge_precision"] >= 0
         assert metrics["rouge.rouge_recall"] >= 0
@@ -186,8 +186,8 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope, data_file):
         assert metrics["fluency.gpt_fluency"] >= 0
         assert metrics["relevance.relevance"] >= 0
         assert metrics["relevance.gpt_relevance"] >= 0
-        # assert metrics['similarity.similarity'] >= 0
-        # assert metrics['similarity.gpt_similarity'] >= 0
+        assert metrics["similarity.similarity"] >= 0
+        assert metrics["similarity.gpt_similarity"] >= 0
         assert metrics["indirect_attack.xpia_manipulated_content"] >= 0
         assert metrics["indirect_attack.xpia_intrusion"] >= 0
         assert metrics["indirect_attack.xpia_information_gathering"] >= 0
@@ -199,17 +199,17 @@ def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope, data_file):
         assert metrics["protected_material.protected_material_defect_rate"] >= 0
         assert metrics["indirect_attack.xpia_defect_rate"] >= 0
         assert metrics["eci.eci_defect_rate"] >= 0
-        # assert metrics['qa.f1_score'] >= 0
-        # assert metrics['qa.groundedness'] >= 0
-        # assert metrics['qa.gpt_groundedness'] >= 0
-        # assert metrics['qa.coherence'] >= 0
-        # assert metrics['qa.gpt_coherence'] >= 0
-        # assert metrics['qa.fluency'] >= 0
-        # assert metrics['qa.gpt_fluency'] >= 0
-        # assert metrics['qa.relevance'] >= 0
-        # assert metrics['qa.gpt_relevance'] >= 0
-        # assert metrics['qa.similarity'] >= 0
-        # assert metrics['qa.gpt_similarity'] >= 0
+        # assert metrics["qa.f1_score"] >= 0
+        # assert metrics["qa.groundedness"] >= 0
+        # assert metrics["qa.gpt_groundedness"] >= 0
+        # assert metrics["qa.coherence"] >= 0
+        # assert metrics["qa.gpt_coherence"] >= 0
+        # assert metrics["qa.fluency"] >= 0
+        # assert metrics["qa.gpt_fluency"] >= 0
+        # assert metrics["qa.relevance"] >= 0
+        # assert metrics["qa.gpt_relevance"] >= 0
+        # assert metrics["qa.similarity"] >= 0
+        # assert metrics["qa.gpt_similarity"] >= 0

     def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred, project_scope):
         evaluators = {
@@ -291,7 +291,7 @@ def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred, project_scope):
         assert metrics["indirect_attack.xpia_defect_rate"] >= 0
         assert metrics["eci.eci_defect_rate"] >= 0

-    # Imagee urls with target is disabled due to being unstable in CI
+    # Image urls with target is disabled due to being unstable in CI
     @pytest.mark.parametrize(
         "multi_modal_input_type,pm_evaluator_class,cs_evaluator_class",
         [
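
For reference, the pattern this e2e test exercises is the public evaluate() entry point with a dictionary of evaluators: per-row outputs come back flattened into "outputs.<name>.<metric>" columns and aggregates under "metrics", which is what the count assertions above check. A minimal sketch, assuming a local JSONL file with query/response/ground_truth columns and a placeholder model configuration:

import pandas as pd

from azure.ai.evaluation import (
    BleuScoreEvaluator,
    F1ScoreEvaluator,
    GleuScoreEvaluator,
    SimilarityEvaluator,
    evaluate,
)

# Placeholder model configuration; an AzureOpenAIModelConfiguration-shaped
# dict works for the prompty-based evaluators.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}

result = evaluate(
    data="evaluate_test_data.jsonl",  # placeholder path; rows need query/response/ground_truth
    evaluators={
        "f1_score": F1ScoreEvaluator(),
        "gleu": GleuScoreEvaluator(),
        "bleu": BleuScoreEvaluator(),
        "similarity": SimilarityEvaluator(model_config),
    },
)

row_result_df = pd.DataFrame(result["rows"])
# Per-evaluator outputs land in "outputs.<name>.<metric>" columns.
print(row_result_df.filter(like="outputs.gleu").head())
print(result["metrics"]["similarity.similarity"])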

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ def test_fluency_evaluator_empty_string(self, mock_model_config):

     def test_similarity_evaluator_keys(self, mock_model_config):
         similarity_eval = SimilarityEvaluator(model_config=mock_model_config)
-        similarity_eval._async_evaluator._flow = MagicMock(return_value=quality_no_response_async_mock())
+        similarity_eval._flow = MagicMock(return_value=quality_no_response_async_mock())

         result = similarity_eval(
             query="What is the capital of Japan?",

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate_performance.py

Lines changed: 2 additions & 2 deletions
@@ -50,7 +50,7 @@ def test_bulk_evaluate(self, big_f1_data_file, use_pf_client):
         if in_ci():
             max_duration += 25
         if use_pf_client:  # PF client doesn't seem to parallelize, and takes about a second or 2 to start
-            max_duration += 6.5
+            max_duration += 7.5
         assert diff < max_duration
         row_result_df = pd.DataFrame(result["rows"])
         assert "outputs.f1.f1_score" in row_result_df.columns
@@ -76,7 +76,7 @@ def test_evaluate_parallelism(self, ten_queries_file, use_pf_client):
         # 2 batches at most, so it should take between 1 and 1.5 seconds.
         max_duration = 1.5
         if use_pf_client:  # PF client doesn't seem to parallelize, and takes about a second to start.
-            max_duration += 7.5
+            max_duration += 8.5
         assert diff < max_duration
         row_result_df = pd.DataFrame(result["rows"])
         assert "outputs.slow.result" in row_result_df.columns
