
Commit 51176df

Fix Flow Structure for Relevance and Response Completeness Evaluators (#43645)
1 parent a9741f5 commit 51176df

3 files changed: +63 −27 lines changed

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py

Lines changed: 6 additions & 7 deletions
@@ -87,15 +87,13 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
     def __init__(self, model_config, *, credential=None, threshold=3, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        self._threshold = threshold
-        self._higher_is_better = True
         super().__init__(
             model_config=model_config,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
             credential=credential,
-            _higher_is_better=self._higher_is_better,
+            _higher_is_better=True,
             **kwargs,
         )
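Note on this hunk: the two deleted assignments are not lost; threshold and _higher_is_better are now forwarded to super().__init__, so the base class is expected to own self._threshold and the pass/fail mapping used later by _get_binary_result. A rough illustration of that pattern (hypothetical class, not the real PromptyEvaluatorBase):

class _EvaluatorBaseSketch:
    """Illustration only: a base evaluator that owns the threshold and
    derives the pass/fail verdict for its subclasses."""

    def __init__(self, *, result_key, threshold, _higher_is_better=True, **kwargs):
        self._result_key = result_key
        self._threshold = threshold
        self._higher_is_better = _higher_is_better

    def _get_binary_result(self, score: float) -> str:
        # NaN never compares as true, so a missing score falls through to "fail".
        if self._higher_is_better:
            return "pass" if score >= self._threshold else "fail"
        return "pass" if score <= self._threshold else "fail"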

@@ -178,7 +176,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
         if not isinstance(eval_input["response"], str):
             eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
         result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-        llm_output = result["llm_output"]
+        llm_output = result.get("llm_output")
         score = math.nan

         if isinstance(llm_output, dict):
@@ -188,10 +186,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
             binary_result = self._get_binary_result(score)
             return {
                 self._result_key: float(score),
-                f"gpt_{self._result_key}": float(score),
-                f"{self._result_key}_reason": reason,
                 f"{self._result_key}_result": binary_result,
                 f"{self._result_key}_threshold": self._threshold,
+                f"{self._result_key}_reason": reason,
                 f"{self._result_key}_prompt_tokens": result.get("input_token_count", 0),
                 f"{self._result_key}_completion_tokens": result.get("output_token_count", 0),
                 f"{self._result_key}_total_tokens": result.get("total_token_count", 0),
@@ -201,10 +198,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
                 f"{self._result_key}_sample_output": result.get("sample_output", ""),
             }

+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
         binary_result = self._get_binary_result(score)
         return {
             self._result_key: float(score),
-            f"gpt_{self._result_key}": float(score),
             f"{self._result_key}_result": binary_result,
             f"{self._result_key}_threshold": self._threshold,
         }
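For reference, a condensed sketch of what the reshaped _do_eval now does with the flow output. This is a hypothetical standalone function, not the SDK source; summarize_flow_result is an invented name and the pass/fail mapping stands in for _get_binary_result:

import logging
import math

logger = logging.getLogger(__name__)

def summarize_flow_result(result: dict, result_key: str, threshold: float) -> dict:
    # The flow now returns a dict whose "llm_output" entry carries the parsed
    # score/explanation, alongside token and diagnostic fields.
    llm_output = result.get("llm_output")  # tolerate a missing key
    score = math.nan
    if isinstance(llm_output, dict):
        score = float(llm_output.get("score", math.nan))
        reason = llm_output.get("explanation", "")
        return {
            result_key: score,
            f"{result_key}_result": "pass" if score >= threshold else "fail",
            f"{result_key}_threshold": threshold,
            f"{result_key}_reason": reason,
            f"{result_key}_prompt_tokens": result.get("input_token_count", 0),
            f"{result_key}_completion_tokens": result.get("output_token_count", 0),
            f"{result_key}_total_tokens": result.get("total_token_count", 0),
        }
    logger.warning("LLM output is not a dictionary, returning NaN for the score.")
    return {
        result_key: score,
        f"{result_key}_result": "fail",  # NaN never clears the threshold
        f"{result_key}_threshold": threshold,
    }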

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py

Lines changed: 35 additions & 8 deletions
@@ -3,6 +3,7 @@
 # ---------------------------------------------------------

 import os
+import logging
 import math
 from typing import Dict, List, Union, Optional

@@ -14,6 +15,8 @@
 from azure.ai.evaluation._model_configurations import Conversation, Message
 from azure.ai.evaluation._common._experimental import experimental

+logger = logging.getLogger(__name__)
+

 @experimental
 class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
@@ -74,12 +77,14 @@ def __init__(
     ):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        self.threshold = threshold
+        self.threshold = threshold  # to be removed in favor of _threshold
         super().__init__(
             model_config=model_config,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
+            threshold=threshold,
             credential=credential,
+            _higher_is_better=True,
             **kwargs,
         )

@@ -156,20 +161,42 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
                 target=ErrorTarget.COMPLETENESS_EVALUATOR,
             )

-        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = result.get("llm_output") if isinstance(result, dict) else result

         score = math.nan
-        if llm_output:
-            score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")
+        llm_output_is_dict = isinstance(llm_output, dict)
+        if llm_output_is_dict or isinstance(llm_output, str):
+            reason = ""
+            if llm_output_is_dict:
+                score = float(llm_output.get("score", math.nan))
+                reason = llm_output.get("explanation", "")
+            else:
+                score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")

-            score_result = "pass" if score >= self.threshold else "fail"
+            binary_result = self._get_binary_result(score)

             # updating the result key and threshold to int based on the schema
             return {
                 f"{self._result_key}": int(score),
-                f"{self._result_key}_result": score_result,
-                f"{self._result_key}_threshold": int(self.threshold),
+                f"{self._result_key}_result": binary_result,
+                f"{self._result_key}_threshold": int(self._threshold),
                 f"{self._result_key}_reason": reason,
+                f"{self._result_key}_prompt_tokens": result.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": result.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": result.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": result.get("finish_reason", ""),
+                f"{self._result_key}_model": result.get("model_id", ""),
+                f"{self._result_key}_sample_input": result.get("sample_input", ""),
+                f"{self._result_key}_sample_output": result.get("sample_output", ""),
             }

-        return {self._result_key: math.nan}
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
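The behavioral core of this hunk is that llm_output may now arrive in either shape: a structured dict carrying "score"/"explanation", or the legacy tagged string that still goes through parse_quality_evaluator_reason_score. A self-contained sketch of that branch follows; the regex parser is only a stand-in for the real helper so the example runs on its own:

import math
import re
from typing import Tuple, Union

def _parse_tagged_output(text: str) -> Tuple[float, str]:
    # Stand-in for parse_quality_evaluator_reason_score: pull the <S1>
    # explanation and the <S2> score out of the tagged completion.
    reason_match = re.search(r"<S1>(.*?)</S1>", text, re.DOTALL)
    score_match = re.search(r"<S2>\s*([1-5])\s*</S2>", text)
    reason = reason_match.group(1).strip() if reason_match else ""
    score = float(score_match.group(1)) if score_match else math.nan
    return score, reason

def extract_score_and_reason(llm_output: Union[dict, str, None]) -> Tuple[float, str]:
    score, reason = math.nan, ""
    if isinstance(llm_output, dict):      # new structured shape
        score = float(llm_output.get("score", math.nan))
        reason = llm_output.get("explanation", "")
    elif isinstance(llm_output, str):     # legacy tagged string
        score, reason = _parse_tagged_output(llm_output)
    return score, reason

Either {"score": 5, "explanation": "..."} or the tagged "<S0>...</S0>\n<S1>...</S1>\n<S2>5</S2>" string used in the updated test mocks yields the same (score, reason) pair here.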

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_completeness_evaluator.py

Lines changed: 22 additions & 12 deletions
@@ -6,19 +6,29 @@


 async def completeness_response1_async_mock():
-    return """<S0>Let's think step by step: The ground truth states "The capital of Japan is Tokyo," which provides
-    both the subject (capital of Japan) and the specific answer (Tokyo). The response, "The capital of Japan,
-    " only partially addresses the subject but does not provide the specific answer (Tokyo). This means it misses the
-    core claim established in the ground truth.</S0> <S1>The response is fully incomplete as it does not provide the
-    necessary and relevant information, specifically the name of the capital, Tokyo.</S1> <S2>1</S2>"""
+    return {
+        "llm_output": '<S0>Let\'s think step by step: The ground truth states "The capital of Japan is Tokyo." The response is "The capital of Japan." The response does not specify what the capital is; it only repeats part of the question and omits the key information ("Tokyo"). Therefore, none of the necessary information from the ground truth is present in the response.</S0>\n<S1>The response is fully incomplete because it does not provide the answer ("Tokyo") at all.</S1>\n<S2>1</S2>',
+        "input_token_count": 1354,
+        "output_token_count": 108,
+        "total_token_count": 1462,
+        "finish_reason": "stop",
+        "model_id": "gpt-4.1-2025-04-14",
+        "sample_input": '[{"role": "user", "content": "{\\"response\\": \\"The capital of Japan\\", \\"ground_truth\\": \\"The capital of Japan is Tokyo.\\"}"}]',
+        "sample_output": '[{"role": "assistant", "content": "<S0>Let\'s think step by step: The ground truth states \\"The capital of Japan is Tokyo.\\" The response is \\"The capital of Japan.\\" The response does not specify what the capital is; it only repeats part of the question and omits the key information (\\"Tokyo\\"). Therefore, none of the necessary information from the ground truth is present in the response.</S0>\\n<S1>The response is fully incomplete because it does not provide the answer (\\"Tokyo\\") at all.</S1>\\n<S2>1</S2>"}]',
+    }


 async def completeness_response2_async_mock():
-    return """<S0>Let's think step by step: The response states that the capital of Japan is Tokyo. The ground truth
-    also states that the capital of Japan is Tokyo. Both the response and the ground truth are identical, containing
-    all the necessary and relevant information. There is no missing or incorrect information in the response.</S0>
-    <S1>The response perfectly matches the ground truth, containing all the necessary and relevant information
-    without any omissions or errors.</S1> <S2>5</S2>"""
+    return {
+        "llm_output": '<S0>Let\'s think step by step: The ground truth contains a single statement: "The capital of Japan is Tokyo." The response exactly matches this statement without omitting or altering any information. There are no additional claims or missing details to consider. According to the definitions, a fully complete response should perfectly contain all necessary and relevant information from the ground truth.</S0>\n<S1>The response is a perfect match to the ground truth, with no missing or incorrect information.</S1>\n<S2>5</S2>',
+        "input_token_count": 1356,
+        "output_token_count": 107,
+        "total_token_count": 1463,
+        "finish_reason": "stop",
+        "model_id": "gpt-4.1-2025-04-14",
+        "sample_input": '[{"role": "user", "content": "{\\"response\\": \\"The capital of Japan is Tokyo.\\", \\"ground_truth\\": \\"The capital of Japan is Tokyo.\\"}"}]',
+        "sample_output": '[{"role": "assistant", "content": "<S0>Let\'s think step by step: The ground truth contains a single statement: \\"The capital of Japan is Tokyo.\\" The response exactly matches this statement without omitting or altering any information. There are no additional claims or missing details to consider. According to the definitions, a fully complete response should perfectly contain all necessary and relevant information from the ground truth.</S0>\\n<S1>The response is a perfect match to the ground truth, with no missing or incorrect information.</S1>\\n<S2>5</S2>"}]',
+    }


 @pytest.mark.usefixtures("mock_model_config")
@@ -81,7 +91,7 @@ def test_evaluate_completeness_valid2(self, mock_model_config):
         assert result[key] == 5
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ResponseCompletenessEvaluator._DEFAULT_COMPLETENESS_THRESHOLD
-        assert "The response perfectly matches " in result[f"{key}_reason"]
+        assert "The response is a perfect match " in result[f"{key}_reason"]

     def test_evaluate_completeness_valid3(self, mock_model_config):
         response_completeness_evaluator = ResponseCompletenessEvaluator(
@@ -103,7 +113,7 @@ def test_evaluate_completeness_valid3(self, mock_model_config):
         assert result[key] == 5
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ResponseCompletenessEvaluator._DEFAULT_COMPLETENESS_THRESHOLD
-        assert "The response perfectly matches " in result[f"{key}_reason"]
+        assert "The response is a perfect match " in result[f"{key}_reason"]

     def test_evaluate_completeness_missing_ground_truth(self, mock_model_config):
         response_completeness_evaluator = ResponseCompletenessEvaluator(model_config=mock_model_config)
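The mocks now return the full flow payload (llm_output plus token counts) rather than a bare string, so the tests keep exercising the public call path. A hypothetical wiring of such a mock, assuming the evaluator stays synchronously callable with response/ground_truth and that its _flow attribute can be swapped for an AsyncMock (everything outside the diff here is an assumption):

from unittest.mock import AsyncMock

def run_with_mocked_flow(evaluator, flow_payload: dict):
    # Resolve the prompty flow to the canned payload instead of calling a model.
    evaluator._flow = AsyncMock(return_value=flow_payload)
    result = evaluator(
        response="The capital of Japan is Tokyo.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    key = evaluator._result_key  # e.g. "response_completeness" (assumed)
    # With a parsable llm_output the enriched keys should be present.
    assert result[f"{key}_result"] in ("pass", "fail")
    assert result[f"{key}_total_tokens"] == flow_payload.get("total_token_count", 0)
    return result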
