
Commit d29dd83

Evaluate aggregation robustness (Azure#38367)
* Make evaluate aggregation more resilient against bad inputs
* cl
* Remove accidental test file changes
* Remove accidental test file changes2
* Remove accidental test file changes3
* More useful errors
* Change exception to warnings
1 parent 3d27b32 commit d29dd83

File tree

5 files changed: +135 / -35 lines changed


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -12,6 +12,9 @@
 - Output of adversarial simulators are of type `JsonLineList` and the helper function `to_eval_qr_json_lines` now outputs context from both user and assistant turns along with `category` if it exists in the conversation
 - Fixed an issue where during long-running simulations, API token expires causing "Forbidden" error. Instead, users can now set an environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently to prevent expiration and ensure continuous operation of the simulation.
 - Fixed an issue with the `ContentSafetyEvaluator` that caused parallel execution of sub-evaluators to fail. Parallel execution is now enabled by default again, but can still be disabled via the '_parallel' boolean keyword argument during class initialization.
+- Fix `evaluate` function not producing aggregated metrics if ANY values to be aggregated were None, NaN, or
+  otherwise difficult to process. Such values are ignored fully, so the aggregated metric of `[1, 2, 3, NaN]`
+  would be 2, not 1.5.
 
 ### Other Changes
 - Refined error messages for serviced-based evaluators and simulators.
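To make the arithmetic in the new changelog entry concrete, here is a minimal plain-Python sketch (not SDK code) of the "ignore NaN entirely" behavior it describes:

```python
import math

scores = [1, 2, 3, math.nan]
valid = [s for s in scores if not math.isnan(s)]  # drop the NaN value entirely

print(sum(valid) / len(valid))   # 2.0 -- the new aggregated metric: NaN is ignored fully
print(sum(valid) / len(scores))  # 1.5 -- what counting NaN toward the denominator would give
```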

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/math.py

Lines changed: 62 additions & 2 deletions
@@ -3,20 +3,44 @@
 # ---------------------------------------------------------
 
 import math
-from typing import List
+from typing import List, Callable, Any
 
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 
 
 def list_sum(lst: List[float]) -> float:
+    """Given a list of floats, return the sum of the values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :return: The sum of the values in the list.
+    :rtype: float
+    """
+
     return sum(lst)
 
 
 def list_mean(lst: List[float]) -> float:
+    """Given a list of floats, calculate the mean of the values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :return: The mean of the values in the list.
+    :rtype: float
+    """
+
     return list_sum(lst) / len(lst)
 
 
 def list_mean_nan_safe(lst: List[float]) -> float:
+    """Given a list of floats, remove all nan or None values, then calculate the mean of the remaining values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :return: The mean of the values in the list.
+    :rtype: float
+    """
+
     msg = "All score values are NaN. The mean cannot be calculated."
     if all(math.isnan(l) for l in lst):
         raise EvaluationException(
@@ -26,4 +50,40 @@ def list_mean_nan_safe(lst: List[float]) -> float:
             category=ErrorCategory.INVALID_VALUE,
             target=ErrorTarget.CONVERSATION,
         )
-    return list_mean([l for l in lst if not math.isnan(l)])
+    return list_mean([l for l in lst if not is_none_or_nan(l)])
+
+
+def apply_transform_nan_safe(lst: List[float], transform_fn: Callable[[float], Any]) -> List[Any]:
+    """Given a list of floats, remove all nan values, then apply the inputted transform function
+    to the remaining values, and return the resulting list of outputted values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :param transform_fn: A function that produces something when applied to a float.
+    :type transform_fn: Callable[[float], Any]
+    :return: A list of the transformed values.
+    :rtype: List[Any]
+    """
+
+    msg = "All score values are NaN. The mean cannot be calculated."
+    if all(math.isnan(l) for l in lst):
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.CONVERSATION,
+        )
+    return [transform_fn(l) for l in lst if not is_none_or_nan(l)]
+
+
+def is_none_or_nan(val: float) -> bool:
+    """math.isnan raises an error if None is inputted. This is a more robust wrapper.
+
+    :param val: The value to check.
+    :type val: float
+    :return: Whether the value is None or NaN.
+    :rtype: bool
+    """
+
+    return val is None or math.isnan(val)
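A short usage sketch for the helpers added above. They live in a private `_common` module, so the import below simply mirrors the one `_evaluate.py` uses in this commit and is shown only for illustration:

```python
import math

# Private/internal module; import path taken from the _evaluate.py change in this commit.
from azure.ai.evaluation._common.math import (
    apply_transform_nan_safe,
    is_none_or_nan,
    list_mean_nan_safe,
)

scores = [1.0, 2.0, 3.0, math.nan, None]

print(is_none_or_nan(None))        # True -- math.isnan(None) would raise TypeError instead
print(list_mean_nan_safe(scores))  # 2.0 -- NaN/None entries are dropped before averaging

# The threshold of 2 here is arbitrary, purely for demonstration.
print(apply_transform_nan_safe(scores, lambda x: 1 if x >= 2 else 0))  # [0, 1, 1]
```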

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 25 additions & 14 deletions
@@ -3,6 +3,7 @@
 # ---------------------------------------------------------
 import inspect
 import json
+import logging
 import os
 import re
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
@@ -13,7 +14,7 @@
 from promptflow.client import PFClient
 from promptflow.entities import Run
 
-from azure.ai.evaluation._common.math import list_sum
+from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
@@ -35,6 +36,7 @@
 )
 
 TClient = TypeVar("TClient", ProxyClient, CodeClient)
+LOGGER = logging.getLogger(__name__)
 
 # For metrics (aggregates) whose metric names intentionally differ from their
 # originating column name, usually because the aggregation of the original value
@@ -69,10 +71,11 @@ def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, flo
             renamed_cols.append(col)
             new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
             col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
-            metric_columns[new_col_name] = round(
-                list_sum(col_with_numeric_values) / col_with_numeric_values.count(),
-                2,
-            )
+            try:
+                metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
+            except EvaluationException:  # only exception that can be cause is all NaN values
+                msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+                LOGGER.warning(msg)
 
     return renamed_cols, metric_columns
 
@@ -119,11 +122,15 @@ def _aggregate_content_safety_metrics(
     for col in content_safety_df.columns:
         defect_rate_name = col.replace("_score", "_defect_rate")
         col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
-        defect_rates[defect_rate_name] = round(
-            list_sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
-            / col_with_numeric_values.count(),
-            2,
-        )
+        try:
+            col_with_boolean_values = apply_transform_nan_safe(
+                col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
+            )
+            defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+        except EvaluationException:  # only exception that can be cause is all NaN values
+            msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+            LOGGER.warning(msg)
+
     return content_safety_cols, defect_rates
 
 
@@ -153,10 +160,11 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
     for col in label_df.columns:
         defect_rate_name = col.replace("_label", "_defect_rate")
         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
-        defect_rates[defect_rate_name] = round(
-            list_sum(col_with_boolean_values) / col_with_boolean_values.count(),
-            2,
-        )
+        try:
+            defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+        except EvaluationException:  # only exception that can be cause is all NaN values
+            msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+            LOGGER.warning(msg)
     return label_cols, defect_rates
 
 
@@ -193,6 +201,9 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     # For rest of metrics, we will calculate mean
     df.drop(columns=handled_columns, inplace=True)
 
+    # NOTE: nan/None values don't count as as booleans, so boolean columns with
+    # nan/None values won't have a mean produced from them.
+    # This is different from label-based known evaluators, which have special handling.
     mean_value = df.mean(numeric_only=True)
     metrics = mean_value.to_dict()
     # Add defect rates back into metrics
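To see the warn-and-skip behavior introduced above in isolation, here is a minimal, self-contained sketch of the per-column defect-rate logic. The threshold value 4 and the column name are assumptions standing in for `CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT` and real evaluator output; this is not the SDK function itself:

```python
import logging
import math
from typing import Optional

import pandas as pd

LOGGER = logging.getLogger(__name__)
DEFECT_THRESHOLD = 4  # assumed stand-in for CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT


def defect_rate_for(series: pd.Series) -> Optional[float]:
    """Mirror of the commit's per-column pattern: coerce to numeric, drop NaN/None,
    average a 0/1 defect indicator, and warn (instead of raising) when nothing is left."""
    numeric = pd.to_numeric(series, errors="coerce")       # None and unparseable values become NaN
    valid = [v for v in numeric if not math.isnan(v)]
    if not valid:
        LOGGER.warning(
            "All score evaluations are NaN/None for column %s. No aggregation can be performed.", series.name
        )
        return None
    return round(sum(1 if v >= DEFECT_THRESHOLD else 0 for v in valid) / len(valid), 2)


df = pd.DataFrame({"content_safety.violence_score": [0, 0, 1, 2, 5, 5, 6, 7, float("nan"), None]})
print(defect_rate_for(df["content_safety.violence_score"]))            # 0.5 -- 4 of the 8 usable scores are >= 4
print(defect_rate_for(pd.Series([None, float("nan")], name="empty")))  # None, after a logged warning
```

The production code delegates the filtering to `apply_transform_nan_safe` and `list_mean_nan_safe`; the sketch inlines that step so it can run standalone.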

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py

Lines changed: 1 addition & 1 deletion
@@ -403,7 +403,7 @@ def test_evaluate_with_groundedness_pro_evaluator(self, project_scope, data_conv
         assert "groundedness_pro_label" in per_turn_results.keys()
         assert "groundedness_pro_reason" in per_turn_results.keys()
 
-        # Check that label is renamed to passsing rate in metrics
+        # Check that label is renamed to passing rate in metrics
         assert "groundedness_pro.groundedness_pro_passing_rate" in convo_metrics.keys()
         assert 0 <= convo_metrics.get("groundedness_pro.groundedness_pro_passing_rate") <= 1

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py

Lines changed: 44 additions & 18 deletions
@@ -2,6 +2,7 @@
 import math
 import os
 import pathlib
+import numpy as np
 from unittest.mock import patch
 
 import pandas as pd
@@ -483,10 +484,10 @@ def test_get_trace_destination(self, mock_validate_trace_destination, mock_proje
     def test_content_safety_aggregation(self):
         data = {
             # 10 zeroes in a list fully written out
-            "content_safety.violence_score": [0, 0, 1, 2, 5, 5, 6, 7, 9, 6],
-            "content_safety.sexual_score": [0, 0, 2, 3, 3, 3, 8, 8, 8, 8],
-            "content_safety.self_harm_score": [0, 0, 0, 0, 1, 1, 1, 1, 7, 7],
-            "content_safety.hate_unfairness_score": [0, 0, 1, 1, 2, 2, 3, 3, 3, 6],
+            "content_safety.violence_score": [0, 0, 1, 2, 5, 5, 6, 7, np.nan, None],
+            "content_safety.sexual_score": [0, 0, 2, 3, 3, 3, 8, 8, np.nan, None],
+            "content_safety.self_harm_score": [0, 0, 0, 0, 1, 1, 1, 1, np.nan, None],
+            "content_safety.hate_unfairness_score": [0, 0, 1, 1, 2, 2, 3, 5, 6, 7],
             "content_safety.violence": [
                 "low",
                 "low",
@@ -514,18 +515,21 @@ def test_content_safety_aggregation(self):
         aggregation = _aggregate_metrics(data_df, evaluators)
 
         assert len(aggregation) == 4
-        assert aggregation["content_safety.violence_defect_rate"] == 0.6
-        assert aggregation["content_safety.sexual_defect_rate"] == 0.4
-        assert aggregation["content_safety.self_harm_defect_rate"] == 0.2
-        assert aggregation["content_safety.hate_unfairness_defect_rate"] == 0.1
+        assert aggregation["content_safety.violence_defect_rate"] == 0.5
+        assert aggregation["content_safety.sexual_defect_rate"] == 0.25
+        assert aggregation["content_safety.self_harm_defect_rate"] == 0.0
+        assert aggregation["content_safety.hate_unfairness_defect_rate"] == 0.3
+
+        no_results = _aggregate_metrics(pd.DataFrame({"content_safety.violence_score": [np.nan, None]}), evaluators)
+        assert len(no_results) == 0
 
     def test_label_based_aggregation(self):
         data = {
-            "eci.eci_label": [True, False, True, False, True],
+            "eci.eci_label": [True, True, True, np.nan, None],
             "eci.eci_reasoning": ["a", "b", "c", "d", "e"],
             "protected_material.protected_material_label": [False, False, False, False, True],
             "protected_material.protected_material_reasoning": ["f", "g", "h", "i", "j"],
-            "unknown.unaccounted_label": [True, False, False, False, True],
+            "unknown.unaccounted_label": [False, False, False, True, True],
             "unknown.unaccounted_reasoning": ["k", "l", "m", "n", "o"],
         }
         data_df = pd.DataFrame(data)
@@ -540,18 +544,37 @@ def test_label_based_aggregation(self):
         assert "protected_material.protected_material_label" not in aggregation
         assert aggregation["unknown.unaccounted_label"] == 0.4
 
-        assert aggregation["eci.eci_defect_rate"] == 0.6
+        assert aggregation["eci.eci_defect_rate"] == 1.0
         assert aggregation["protected_material.protected_material_defect_rate"] == 0.2
         assert "unaccounted_defect_rate" not in aggregation
 
+        no_results = _aggregate_metrics(pd.DataFrame({"eci.eci_label": [np.nan, None]}), evaluators)
+        assert len(no_results) == 0
+
+    def test_other_aggregation(self):
+        data = {
+            "thing.groundedness_pro_label": [True, False, True, False, np.nan, None],
+        }
+        data_df = pd.DataFrame(data)
+        evaluators = {}
+        aggregation = _aggregate_metrics(data_df, evaluators)
+
+        assert len(aggregation) == 1
+        assert aggregation["thing.groundedness_pro_passing_rate"] == 0.5
+
+        no_results = _aggregate_metrics(pd.DataFrame({"thing.groundedness_pro_label": [np.nan, None]}), {})
+        assert len(no_results) == 0
+
     def test_general_aggregation(self):
         data = {
-            "thing.metric": [1, 2, 3, 4, 5],
-            "thing.reasoning": ["a", "b", "c", "d", "e"],
-            "other_thing.other_meteric": [-1, -2, -3, -4, -5],
-            "other_thing.other_reasoning": ["f", "g", "h", "i", "j"],
-            "final_thing.final_metric": [False, False, False, True, True],
-            "bad_thing.mixed_metric": [0, 1, False, True, True],
+            "thing.metric": [1, 2, 3, 4, 5, np.nan, None],
+            "thing.reasoning": ["a", "b", "c", "d", "e", "f", "g"],
+            "other_thing.other_meteric": [-1, -2, -3, -4, -5, np.nan, None],
+            "other_thing.other_reasoning": ["f", "g", "h", "i", "j", "i", "j"],
+            "final_thing.final_metric": [False, False, False, True, True, True, False],
+            "bad_thing.mixed_metric": [0, 1, False, True, 0.5, True, False],
+            "bad_thing.boolean_with_nan": [True, False, True, False, True, False, np.nan],
+            "bad_thing.boolean_with_none": [True, False, True, False, True, False, None],
         }
         data_df = pd.DataFrame(data)
         evaluators = {}
@@ -560,7 +583,10 @@ def test_general_aggregation(self):
         assert len(aggregation) == 3
         assert aggregation["thing.metric"] == 3
         assert aggregation["other_thing.other_meteric"] == -3
-        assert aggregation["final_thing.final_metric"] == 0.4
+        assert aggregation["final_thing.final_metric"] == 3 / 7.0
+        assert "bad_thing.mixed_metric" not in aggregation
+        assert "bad_thing.boolean_with_nan" not in aggregation
+        assert "bad_thing.boolean_with_none" not in aggregation
 
     @pytest.mark.parametrize("use_pf_client", [True, False])
     def test_optional_inputs_with_data(self, questions_file, questions_answers_basic_file, use_pf_client):
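The `bad_thing.boolean_with_nan` / `bad_thing.boolean_with_none` assertions above follow directly from pandas dtype rules, which the commit's added NOTE comment relies on. A minimal sketch (not part of the test file) illustrating why such columns drop out of the general aggregation:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "final_thing.final_metric": [False, False, False, True, True, True, False],
    "bad_thing.boolean_with_nan": [True, False, True, False, True, False, np.nan],
})

print(df.dtypes)
# final_thing.final_metric      bool    -> counted as numeric, mean is 3/7
# bad_thing.boolean_with_nan    object  -> the NaN forces object dtype

print(df.mean(numeric_only=True))
# Only final_thing.final_metric appears: the object column is silently skipped,
# which is why the test asserts it is absent from the aggregation.
```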
