 # ---------------------------------------------------------

 from collections import Counter
-from typing import List
+from typing import List, Dict
+from typing_extensions import overload, override

-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from azure.ai.evaluation._evaluators._common import EvaluatorBase

-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

+class F1ScoreEvaluator(EvaluatorBase):
+    """
+    Calculates the F1 score for a given response and ground truth or a multi-turn conversation.

-class _AsyncF1ScoreEvaluator:
-    def __init__(self):
-        pass
+    F1 Scores range from 0 to 1, with 1 being the best possible score.

-    async def __call__(self, *, response: str, ground_truth: str, **kwargs):
-        """
-        Evaluate F1 score.
+    The F1-score computes the ratio of the number of shared words between the model generation and
+    the ground truth. Ratio is computed over the individual words in the generated response against those in the ground
+    truth answer. The number of shared words between the generation and the truth is the basis of the F1 score:
+    precision is the ratio of the number of shared words to the total number of words in the generation, and recall
+    is the ratio of the number of shared words to the total number of words in the ground truth.

-        :keyword response: The response to be evaluated.
-        :paramtype response: str
-        :keyword ground_truth: The ground truth to be evaluated.
-        :paramtype ground_truth: str
-        :return: The F1 score.
-        :rtype: Dict[str, float]
-        """
-        # Validate inputs
-        if not (response and response.strip() and response != "None") or not (
-            ground_truth and ground_truth.strip() and ground_truth != "None"
-        ):
-            msg = "Both 'response' and 'ground_truth' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.F1_EVALUATOR,
-            )
+    Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your
+    model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
+    information in the response.


-        # Run f1 score computation.
-        f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth)

-        return {"f1_score": f1_result}
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START f1_score_evaluator]
+            :end-before: [END f1_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call an F1ScoreEvaluator.
+    """
+
+    id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    def __init__(self):
+        super().__init__()

     @classmethod
     def _compute_f1_score(cls, response: str, ground_truth: str) -> float:
@@ -103,41 +102,24 @@ def lower(text):

         return f1

+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce an f1 score evaluation result.

-class F1ScoreEvaluator:
-    """
-    Calculates the F1 score for a given response and ground truth or a multi-turn conversation.
-
-    F1 Scores range from 0 to 1, with 1 being the best possible score.
-
-    The F1-score computes the ratio of the number of shared words between the model generation and
-    the ground truth. Ratio is computed over the individual words in the generated response against those in the ground
-    truth answer. The number of shared words between the generation and the truth is the basis of the F1 score:
-    precision is the ratio of the number of shared words to the total number of words in the generation, and recall
-    is the ratio of the number of shared words to the total number of words in the ground truth.
-
-    Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your
-    model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
-    information in the response.
-
-
-    .. admonition:: Example:
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START f1_score_evaluator]
-            :end-before: [END f1_score_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call an F1ScoreEvaluator.
-    """
-
-    id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
-    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        # Run f1 score computation.
+        f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth)

-    def __init__(self):
-        self._async_evaluator = _AsyncF1ScoreEvaluator()
+        return {"f1_score": f1_result}

-    def __call__(self, *, response: str, ground_truth: str, **kwargs):
+    @overload  # type: ignore
+    def __call__(self, *, response: str, ground_truth: str) -> Dict[str, float]:
         """
         Evaluate F1 score.

@@ -149,9 +131,20 @@ def __call__(self, *, response: str, ground_truth: str, **kwargs):
         :rtype: Dict[str, float]
         """

-        return async_run_allowing_running_loop(
-            self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
-        )
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate F1 score.

-    def _to_async(self):
-        return self._async_evaluator
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The F1 score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
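
# ----------------------------------------------------------------------
# Illustrative sketch (not part of the diff above): a minimal, self-contained
# version of the shared-word F1 computation that the class docstring describes,
# i.e. precision = shared / len(response words), recall = shared / len(ground
# truth words), and F1 = 2 * precision * recall / (precision + recall). The
# helper name `f1_from_texts` and the plain lowercase/whitespace tokenizer are
# assumptions for illustration only; the SDK's _compute_f1_score applies its
# own text normalization.
from collections import Counter


def f1_from_texts(response: str, ground_truth: str) -> float:
    response_tokens = response.lower().split()
    truth_tokens = ground_truth.lower().split()
    # Multiset intersection counts how many words the two texts share.
    shared = sum((Counter(response_tokens) & Counter(truth_tokens)).values())
    if shared == 0:
        return 0.0
    precision = shared / len(response_tokens)
    recall = shared / len(truth_tokens)
    return 2 * precision * recall / (precision + recall)


# Expected call shape of the refactored evaluator, per the __call__ overload in
# this diff (keyword-only arguments, dict result):
#     evaluator = F1ScoreEvaluator()
#     result = evaluator(response="Tokyo is the capital of Japan.",
#                        ground_truth="The capital of Japan is Tokyo.")
#     result["f1_score"]  # float between 0 and 1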