Commit 8bff427

refactor math evals (Azure#38951)
* refactor math evals
* fix tests, add fail flag to evaluate
1 parent 487d0b4 commit 8bff427

8 files changed: +315 additions, -196 deletions

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 42 additions & 0 deletions
@@ -20,6 +20,7 @@
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
+    DefaultOpenEncoding,
     Prefixes,
     _InternalEvaluationMetrics,
 )
@@ -569,6 +570,7 @@ def evaluate(
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
@@ -594,6 +596,11 @@ def evaluate(
     :paramtype output_path: Optional[str]
     :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
     :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
+    :keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException
+        if ANY evaluator fails during their evaluation.
+        Defaults to false, which means that evaluations will continue regardless of failures.
+        If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
+    :paramtype fail_on_evaluator_errors: bool
     :return: Evaluation results.
     :rtype: ~azure.ai.evaluation.EvaluationResult

@@ -615,6 +622,7 @@ def evaluate(
             evaluator_config=evaluator_config,
             azure_ai_project=azure_ai_project,
             output_path=output_path,
+            fail_on_evaluator_errors=fail_on_evaluator_errors,
             **kwargs,
         )
     except Exception as e:
@@ -663,6 +671,16 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
     print("\n====================================================\n")


+def _print_fail_flag_warning() -> None:
+    print(
+        "Notice: fail_on_evaluator_errors is enabled. It is recommended that you disable "
+        + "this flag for evaluations on large datasets (loosely defined as more than 10 rows of inputs, "
+        + "or more than 4 evaluators). Using this flag on large datasets runs the risk of large runs failing "
+        + "without producing any outputs, since a single failure will cancel the entire run "
+        "when fail_on_evaluator_errors is enabled."
+    )
+
+
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
     evaluators: Dict[str, Callable],
@@ -672,8 +690,11 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
+    fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> EvaluationResult:
+    if fail_on_evaluator_errors:
+        _print_fail_flag_warning()
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

     # Process evaluator config to replace ${target.} with ${data.}
@@ -773,6 +794,10 @@ def eval_batch_run(
     evaluators_result_df = None
     evaluators_metric = {}
     for evaluator_name, evaluator_result in per_evaluator_results.items():
+        if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
+            _print_summary(per_evaluator_results)
+            _turn_error_logs_into_exception(evaluator_result["run_summary"]["log_path"] + "/error.json")
+
         evaluator_result_df = evaluator_result["result"]

         # drop input columns
@@ -825,3 +850,20 @@ def eval_batch_run(
         _write_output(output_path, result)

     return result
+
+
+def _turn_error_logs_into_exception(log_path: str) -> None:
+    """Produce an EvaluationException using the contents of the inputted
+    file as the error message.
+
+    :param log_path: The path to the error log file.
+    :type log_path: str
+    """
+    with open(log_path, "r", encoding=DefaultOpenEncoding.READ) as file:
+        error_message = file.read()
+    raise EvaluationException(
+        message=error_message,
+        target=ErrorTarget.EVALUATE,
+        category=ErrorCategory.FAILED_EXECUTION,
+        blame=ErrorBlame.UNKNOWN,
+    )
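
For orientation, a minimal, hedged sketch of how the new flag is used from the public entry point. The evaluate keywords match the signature in the diff above; the dataset path, column names, and the "metrics" key of the returned result are illustrative assumptions, not part of this commit.

from azure.ai.evaluation import BleuScoreEvaluator, F1ScoreEvaluator, evaluate

# Hedged sketch: assumes a local JSONL dataset whose rows carry "response" and
# "ground_truth" columns, which is what the two refactored evaluators expect.
result = evaluate(
    data="eval_inputs.jsonl",  # hypothetical path
    evaluators={
        "bleu": BleuScoreEvaluator(),
        "f1": F1ScoreEvaluator(),
    },
    # New in this commit: abort with an EvaluationException as soon as any evaluator
    # reports failed lines, instead of continuing with missing metrics (default: False).
    fail_on_evaluator_errors=True,
)
print(result["metrics"])  # assumed key holding the aggregate scores
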
Lines changed: 46 additions & 25 deletions
@@ -1,30 +1,16 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict
 from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import overload, override

 from azure.ai.evaluation._common.utils import nltk_tokenize

+from azure.ai.evaluation._evaluators._common import EvaluatorBase

-class _AsyncBleuScoreEvaluator:
-    def __init__(self):
-        pass
-
-    async def __call__(self, *, response: str, ground_truth: str, **kwargs):
-        reference_tokens = nltk_tokenize(ground_truth)
-        hypothesis_tokens = nltk_tokenize(response)
-
-        # NIST Smoothing
-        smoothing_function = SmoothingFunction().method4
-        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
-
-        return {
-            "bleu_score": score,
-        }

-
-class BleuScoreEvaluator:
+class BleuScoreEvaluator(EvaluatorBase):
     """
     Calculate the BLEU score for a given response and ground truth.
@@ -51,9 +37,32 @@ class BleuScoreEvaluator:
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     def __init__(self):
-        self._async_evaluator = _AsyncBleuScoreEvaluator()
+        super().__init__()
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a BLEU score evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)

-    def __call__(self, *, response: str, ground_truth: str, **kwargs):
+        # NIST Smoothing
+        smoothing_function = SmoothingFunction().method4
+        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
+
+        return {
+            "bleu_score": score,
+        }
+
+    @overload  # type: ignore
+    def __call__(self, *, response: str, ground_truth: str):
         """
         Evaluate the BLEU score between the response and the ground truth.
@@ -64,9 +73,21 @@ def __call__(self, *, response: str, ground_truth: str, **kwargs):
         :return: The BLEU score.
         :rtype: Dict[str, float]
         """
-        return async_run_allowing_running_loop(
-            self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
-        )

-    def _to_async(self):
-        return self._async_evaluator
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate the BLEU score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The BLEU score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
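
From the caller's side the row-level contract is unchanged: the keyword-only overload above still takes response and ground_truth and returns a dict with a bleu_score key; only the dispatch now goes through EvaluatorBase. A minimal, hedged usage sketch (the example strings are illustrative):

from azure.ai.evaluation import BleuScoreEvaluator

bleu = BleuScoreEvaluator()
# Keyword-only call, mirroring the __call__ overload in the diff above.
result = bleu(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)
print(result["bleu_score"])  # float in [0, 1]; NIST method4 smoothing is applied internally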

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py

Lines changed: 61 additions & 68 deletions
@@ -3,45 +3,44 @@
 # ---------------------------------------------------------

 from collections import Counter
-from typing import List
+from typing import List, Dict
+from typing_extensions import overload, override

-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from azure.ai.evaluation._evaluators._common import EvaluatorBase

-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

+class F1ScoreEvaluator(EvaluatorBase):
+    """
+    Calculates the F1 score for a given response and ground truth or a multi-turn conversation.

-class _AsyncF1ScoreEvaluator:
-    def __init__(self):
-        pass
+    F1 Scores range from 0 to 1, with 1 being the best possible score.

-    async def __call__(self, *, response: str, ground_truth: str, **kwargs):
-        """
-        Evaluate F1 score.
+    The F1-score computes the ratio of the number of shared words between the model generation and
+    the ground truth. Ratio is computed over the individual words in the generated response against those in the ground
+    truth answer. The number of shared words between the generation and the truth is the basis of the F1 score:
+    precision is the ratio of the number of shared words to the total number of words in the generation, and recall
+    is the ratio of the number of shared words to the total number of words in the ground truth.

-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :keyword ground_truth: The ground truth to be evaluated.
-        :paramtype ground_truth: str
-        :return: The F1 score.
-        :rtype: Dict[str, float]
-        """
-        # Validate inputs
-        if not (response and response.strip() and response != "None") or not (
-            ground_truth and ground_truth.strip() and ground_truth != "None"
-        ):
-            msg = "Both 'response' and 'ground_truth' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.F1_EVALUATOR,
-            )
+    Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your
+    model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
+    information in the response.

-        # Run f1 score computation.
-        f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth)

-        return {"f1_score": f1_result}
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START f1_score_evaluator]
+            :end-before: [END f1_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call an F1ScoreEvaluator.
+    """
+
+    id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    def __init__(self):
+        super().__init__()

     @classmethod
     def _compute_f1_score(cls, response: str, ground_truth: str) -> float:
@@ -103,41 +102,24 @@ def lower(text):

         return f1

+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce an f1 score evaluation result.

-class F1ScoreEvaluator:
-    """
-    Calculates the F1 score for a given response and ground truth or a multi-turn conversation.
-
-    F1 Scores range from 0 to 1, with 1 being the best possible score.
-
-    The F1-score computes the ratio of the number of shared words between the model generation and
-    the ground truth. Ratio is computed over the individual words in the generated response against those in the ground
-    truth answer. The number of shared words between the generation and the truth is the basis of the F1 score:
-    precision is the ratio of the number of shared words to the total number of words in the generation, and recall
-    is the ratio of the number of shared words to the total number of words in the ground truth.
-
-    Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your
-    model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
-    information in the response.
-
-
-    .. admonition:: Example:
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START f1_score_evaluator]
-            :end-before: [END f1_score_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call an F1ScoreEvaluator.
-    """
-
-    id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
-    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        # Run f1 score computation.
+        f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth)

-    def __init__(self):
-        self._async_evaluator = _AsyncF1ScoreEvaluator()
+        return {"f1_score": f1_result}

-    def __call__(self, *, response: str, ground_truth: str, **kwargs):
+    @overload  # type: ignore
+    def __call__(self, *, response: str, ground_truth: str) -> Dict[str, float]:
         """
         Evaluate F1 score.
@@ -149,9 +131,20 @@ def __call__(self, *, response: str, ground_truth: str, **kwargs):
         :rtype: Dict[str, float]
         """

-        return async_run_allowing_running_loop(
-            self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
-        )
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate F1 score.

-    def _to_async(self):
-        return self._async_evaluator
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The F1 score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
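
To make the docstring's definition concrete, here is a small standalone sketch of the shared-word F1 arithmetic it describes. This is an illustration of the formula only, not the SDK's _compute_f1_score, which also normalizes the text first (note the lower helper visible in the hunk header above).

from collections import Counter

def shared_word_f1(response: str, ground_truth: str) -> float:
    # Count shared words (with multiplicity) between the generation and the ground truth.
    response_tokens = response.split()
    truth_tokens = ground_truth.split()
    num_shared = sum((Counter(response_tokens) & Counter(truth_tokens)).values())
    if num_shared == 0:
        return 0.0
    precision = num_shared / len(response_tokens)  # shared words / words in the generation
    recall = num_shared / len(truth_tokens)        # shared words / words in the ground truth
    return 2 * precision * recall / (precision + recall)

print(shared_word_f1("the cat sat on the mat", "the cat is on the mat"))  # ~0.83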
