Commit f805a5b

Fix result converter bug: handle null value for evaluation summary calculation (#43753)
* update
* rename
* run black
* fix result counts
* update
* Fix bug
* run black
* fix bug
1 parent 0320376 commit f805a5b

File tree

1 file changed: +63 -19 lines changed
  • sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 63 additions & 19 deletions
@@ -2137,20 +2137,38 @@ def _convert_results_to_aoai_evaluation_results(
             # Create result object for this criteria
             metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
             for metric in metrics:
-                result_obj = {
-                    "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
-                        "type", "azure_ai_evaluator"
-                    ),
-                    "name": criteria_name,  # Use criteria name as name
-                    "metric": metric if metric is not None else criteria_name,  # Use criteria name as metric
-                    "score": None,
-                    "label": None,
-                    "reason": None,
-                    "threshold": None,
-                    "passed": None,
-                    "sample": sample,
-                }
-                run_output_results.append(result_obj)
+                should_add_error_summary = True
+                for result in run_output_results:
+                    if result.get("name", None) == criteria_name and result.get("metric", None) == metric:
+                        rs_score = result.get("score", None)
+                        rs_threshold = result.get("threshold", None)
+                        rs_label = result.get("label", None)
+                        rs_reason = result.get("reason", None)
+                        if (
+                            _is_none_or_nan(rs_score)
+                            and _is_none_or_nan(rs_threshold)
+                            and _is_none_or_nan(rs_label)
+                            and _is_none_or_nan(rs_reason)
+                        ):
+                            run_output_results.remove(result)
+                        else:
+                            should_add_error_summary = False
+                        break  # Skip if already have result for this criteria and metric
+                if should_add_error_summary:
+                    result_obj = {
+                        "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
+                            "type", "azure_ai_evaluator"
+                        ),
+                        "name": criteria_name,  # Use criteria name as name
+                        "metric": metric if metric is not None else criteria_name,  # Use criteria name as metric
+                        "score": None,
+                        "label": None,
+                        "reason": None,
+                        "threshold": None,
+                        "passed": None,
+                        "sample": sample,
+                    }
+                    run_output_results.append(result_obj)
 
             # Create RunOutputItem structure
             run_output_item = {
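
To make the intent of the new block easier to follow, here is a minimal standalone sketch of the same rule, not the SDK code itself: a placeholder row for a (criteria, metric) pair is appended only when no usable result exists yet, and an all-empty duplicate is dropped first. The `ensure_placeholder` helper, the `_is_empty` stand-in for the diff's `_is_none_or_nan` (added further down in this commit), and the sample dicts are hypothetical.

```python
import math
from typing import Any, Dict, List


def _is_empty(value: Any) -> bool:
    # Mirrors the diff's _is_none_or_nan: None, float NaN, or "nan"/"null"/"none" strings count as empty.
    if value is None:
        return True
    if isinstance(value, float) and math.isnan(value):
        return True
    return isinstance(value, str) and value.lower() in ["nan", "null", "none"]


def ensure_placeholder(results: List[Dict[str, Any]], name: str, metric: str) -> None:
    # Hypothetical helper: append an all-None placeholder only if no usable result
    # for (name, metric) exists yet; an all-empty duplicate is removed and replaced.
    for result in results:
        if result.get("name") == name and result.get("metric") == metric:
            if all(_is_empty(result.get(key)) for key in ("score", "threshold", "label", "reason")):
                results.remove(result)  # empty duplicate: fall through and append the placeholder
                break
            return  # a usable result already exists; do not add a placeholder
    results.append(
        {"name": name, "metric": metric, "score": None, "label": None,
         "reason": None, "threshold": None, "passed": None}
    )


existing = [{"name": "relevance", "metric": "relevance", "score": 4.0, "threshold": 3.0,
             "label": "pass", "reason": "grounded answer", "passed": True}]
ensure_placeholder(existing, "relevance", "relevance")
print(len(existing))  # 1 -- the usable result is kept and no placeholder is added
```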
@@ -2182,6 +2200,24 @@ def _convert_results_to_aoai_evaluation_results(
     )
 
 
+def _is_none_or_nan(value: Any) -> bool:
+    """
+    Check if a value is None or NaN.
+
+    :param value: The value to check
+    :type value: Any
+    :return: True if the value is None or NaN, False otherwise
+    :rtype: bool
+    """
+    if value is None:
+        return True
+    if isinstance(value, float) and math.isnan(value):
+        return True
+    if isinstance(value, str) and value.lower() in ["nan", "null", "none"]:
+        return True
+    return False
+
+
 def _append_indirect_attachments_to_results(
     current_result_dict: Dict[str, Any],
     result_name: str,
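
A quick sanity check of the helper added above; the function body is copied from the diff, while the assertions below are illustrative only.

```python
import math
from typing import Any


def _is_none_or_nan(value: Any) -> bool:
    # Copied from the diff above: treats None, float NaN, and the strings
    # "nan"/"null"/"none" (case-insensitive) as empty.
    if value is None:
        return True
    if isinstance(value, float) and math.isnan(value):
        return True
    if isinstance(value, str) and value.lower() in ["nan", "null", "none"]:
        return True
    return False


assert _is_none_or_nan(None)
assert _is_none_or_nan(float("nan"))
assert _is_none_or_nan("NaN") and _is_none_or_nan("null")
assert not _is_none_or_nan(0)       # zero is a real score, not "empty"
assert not _is_none_or_nan("pass")  # labels with content are kept
```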
@@ -2363,7 +2399,7 @@ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logger
         for sample_data in sample_data_list:
             if sample_data and isinstance(sample_data, dict) and "usage" in sample_data:
                 usage_data = sample_data["usage"]
-                model_name = sample_data.get("model", "unknown")
+                model_name = sample_data.get("model", "unknown") if usage_data.get("model", "unknown") else "unknown"
                 if model_name not in model_usage_stats:
                     model_usage_stats[model_name] = {
                         "invocation_count": 0,
@@ -2376,10 +2412,18 @@ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logger
                 model_stats = model_usage_stats[model_name]
                 model_stats["invocation_count"] += 1
                 if isinstance(usage_data, dict):
-                    model_stats["total_tokens"] += usage_data.get("total_tokens", 0)
-                    model_stats["prompt_tokens"] += usage_data.get("prompt_tokens", 0)
-                    model_stats["completion_tokens"] += usage_data.get("completion_tokens", 0)
-                    model_stats["cached_tokens"] += usage_data.get("cached_tokens", 0)
+                    model_stats["total_tokens"] += (
+                        usage_data.get("total_tokens", 0) if usage_data.get("total_tokens", 0) else 0
+                    )
+                    model_stats["prompt_tokens"] += (
+                        usage_data.get("prompt_tokens", 0) if usage_data.get("prompt_tokens", 0) else 0
+                    )
+                    model_stats["completion_tokens"] += (
+                        usage_data.get("completion_tokens", 0) if usage_data.get("completion_tokens", 0) else 0
+                    )
+                    model_stats["cached_tokens"] += (
+                        usage_data.get("cached_tokens", 0) if usage_data.get("cached_tokens", 0) else 0
+                    )
 
         # Convert model usage stats to list format matching EvaluationRunPerModelUsage
         per_model_usage = []
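
For context on why the plain `usage_data.get("total_tokens", 0)` calls were not enough: `dict.get` only falls back to its default when the key is missing, not when the stored value is an explicit None, so a usage payload such as `{"total_tokens": None}` would previously have made the `+=` raise a TypeError. A small illustrative snippet (the payload below is made up):

```python
usage_data = {"total_tokens": None, "prompt_tokens": 150}  # hypothetical payload with an explicit null

# dict.get returns the stored None, not the default, when the key exists:
assert usage_data.get("total_tokens", 0) is None

total = 0
# The pattern from the diff: fall back to 0 for None, 0, and missing values alike.
total += usage_data.get("total_tokens", 0) if usage_data.get("total_tokens", 0) else 0
total += usage_data.get("prompt_tokens", 0) if usage_data.get("prompt_tokens", 0) else 0
print(total)  # 150 -- no TypeError from adding None
```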

0 commit comments