Add task_navigation_efficiency_label output; rename properties to task_navigation_efficiency_details (#43332)

m7md7sien · web-flow · commit a9741f5cfa61 · 2025-10-27T14:09:35.000-07:00
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py
@@ -64,7 +64,7 @@ class TaskNavigationEfficiencyEvaluator(EvaluatorBase):
 
         .. code-block:: python
 
-            from azure.ai.evaluation import TaskNavigationEfficiencyEvaluator
+            from azure.ai.evaluation._evaluators._task_navigation_efficiency import TaskNavigationEfficiencyEvaluator
 
             task_navigation_efficiency_eval = TaskNavigationEfficiencyEvaluator(
                 matching_mode=TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
@@ -320,8 +320,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, Dict[s
             )
 
             return {
+                "task_navigation_efficiency_label": match_result,
                 "task_navigation_efficiency_result": EVALUATION_PASS_FAIL_MAPPING[match_result],
-                "properties": additional_properties_metrics,
+                "task_navigation_efficiency_details": additional_properties_metrics,
             }
         else:
             raise EvaluationException(
diff --git a/sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/task_navigation_efficiency.ipynb b/sdk/evaluation/azure-ai-evaluation/samples/agent_evaluators/task_navigation_efficiency.ipynb
@@ -449,7 +449,7 @@
     "    \n",
     "    # Display the returned results\n",
     "    for key, value in result.items():\n",
-    "        if key == \"properties\":\n",
+    "        if key == \"task_navigation_efficiency_details\":\n",
     "            print(f\"  {key}:\")\n",
     "            for prop_key, prop_value in value.items():\n",
     "                print(f\"    {prop_key}: {prop_value:.3f}\")\n",
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py
@@ -29,10 +29,10 @@ def test_exact_match_scenario(self):
 
         result = evaluator(response=response, ground_truth=ground_truth)
         assert result["task_navigation_efficiency_result"] == "pass"
-        assert "properties" in result
-        assert result["properties"]["precision_score"] == 1.0
-        assert result["properties"]["recall_score"] == 1.0
-        assert result["properties"]["f1_score"] == 1.0
+        assert "task_navigation_efficiency_details" in result
+        assert result["task_navigation_efficiency_details"]["precision_score"] == 1.0
+        assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0
+        assert result["task_navigation_efficiency_details"]["f1_score"] == 1.0
 
     def test_in_order_match_with_extra_steps(self):
         """Test when agent has extra steps but maintains order."""
@@ -60,9 +60,9 @@ def test_in_order_match_with_extra_steps(self):
 
         result = evaluator(response=response, ground_truth=ground_truth)
         assert result["task_navigation_efficiency_result"] == "pass"
-        assert result["properties"]["precision_score"] == 0.75  # 3/4
-        assert result["properties"]["recall_score"] == 1.0  # 3/3
-        assert result["properties"]["f1_score"] == pytest.approx(0.857, rel=1e-2)
+        assert result["task_navigation_efficiency_details"]["precision_score"] == 0.75  # 3/4
+        assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0  # 3/3
+        assert result["task_navigation_efficiency_details"]["f1_score"] == pytest.approx(0.857, rel=1e-2)
 
     def test_any_order_match(self):
         """Test when agent has all steps but in wrong order."""
@@ -88,9 +88,9 @@ def test_any_order_match(self):
 
         result = evaluator(response=response, ground_truth=ground_truth)
         assert result["task_navigation_efficiency_result"] == "pass"
-        assert result["properties"]["precision_score"] == 1.0
-        assert result["properties"]["recall_score"] == 1.0
-        assert result["properties"]["f1_score"] == 1.0
+        assert result["task_navigation_efficiency_details"]["precision_score"] == 1.0
+        assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0
+        assert result["task_navigation_efficiency_details"]["f1_score"] == 1.0
 
     def test_exact_match_failure(self):
         """Test when exact match fails but other matches succeed."""
@@ -159,9 +159,9 @@ def test_tuple_format_with_parameters(self):
 
         result = evaluator(response=response, ground_truth=ground_truth)
         assert result["task_navigation_efficiency_result"] == "pass"
-        assert result["properties"]["precision_score"] == 1.0
-        assert result["properties"]["recall_score"] == 1.0
-        assert result["properties"]["f1_score"] == 1.0
+        assert result["task_navigation_efficiency_details"]["precision_score"] == 1.0
+        assert result["task_navigation_efficiency_details"]["recall_score"] == 1.0
+        assert result["task_navigation_efficiency_details"]["f1_score"] == 1.0
 
     def test_matching_mode_validation(self):
         """Test validation of matching_mode parameter."""