
Commit d29dd83

Evaluate aggregation robustness (Azure#38367)
* Make evaluate aggregation more resilient against bad inputs
* cl
* Remove accidental test file changes
* Remove accidental test file changes2
* Remove accidental test file changes3
* More useful errors
* Change exception to warnings
1 parent 3d27b32 commit d29dd83

File tree

5 files changed: +135 / -35 lines changed


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -12,6 +12,9 @@
 - Output of adversarial simulators are of type `JsonLineList` and the helper function `to_eval_qr_json_lines` now outputs context from both user and assistant turns along with `category` if it exists in the conversation
 - Fixed an issue where during long-running simulations, API token expires causing "Forbidden" error. Instead, users can now set an environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently to prevent expiration and ensure continuous operation of the simulation.
 - Fixed an issue with the `ContentSafetyEvaluator` that caused parallel execution of sub-evaluators to fail. Parallel execution is now enabled by default again, but can still be disabled via the '_parallel' boolean keyword argument during class initialization.
+- Fix `evaluate` function not producing aggregated metrics if ANY values to be aggregated were None, NaN, or
+  otherwise difficult to process. Such values are ignored fully, so the aggregated metric of `[1, 2, 3, NaN]`
+  would be 2, not 1.5.
 
 ### Other Changes
 - Refined error messages for serviced-based evaluators and simulators.
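To make the arithmetic in the new changelog entry concrete, here is a minimal plain-Python sketch (not SDK code) of the "ignore NaN entirely" behavior it describes:

```python
import math

scores = [1, 2, 3, math.nan]
valid = [s for s in scores if not math.isnan(s)]  # drop the NaN value entirely

print(sum(valid) / len(valid))   # 2.0 -- the new aggregated metric: NaN is ignored fully
print(sum(valid) / len(scores))  # 1.5 -- what counting NaN toward the denominator would give
```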

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/math.py

Lines changed: 62 additions & 2 deletions
@@ -3,20 +3,44 @@
 # ---------------------------------------------------------
 
 import math
-from typing import List
+from typing import List, Callable, Any
 
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 
 
 def list_sum(lst: List[float]) -> float:
+    """Given a list of floats, return the sum of the values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :return: The sum of the values in the list.
+    :rtype: float
+    """
+
     return sum(lst)
 
 
 def list_mean(lst: List[float]) -> float:
+    """Given a list of floats, calculate the mean of the values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :return: The mean of the values in the list.
+    :rtype: float
+    """
+
     return list_sum(lst) / len(lst)
 
 
 def list_mean_nan_safe(lst: List[float]) -> float:
+    """Given a list of floats, remove all nan or None values, then calculate the mean of the remaining values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :return: The mean of the values in the list.
+    :rtype: float
+    """
+
     msg = "All score values are NaN. The mean cannot be calculated."
     if all(math.isnan(l) for l in lst):
         raise EvaluationException(
@@ -26,4 +50,40 @@ def list_mean_nan_safe(lst: List[float]) -> float:
             category=ErrorCategory.INVALID_VALUE,
             target=ErrorTarget.CONVERSATION,
         )
-    return list_mean([l for l in lst if not math.isnan(l)])
+    return list_mean([l for l in lst if not is_none_or_nan(l)])
+
+
+def apply_transform_nan_safe(lst: List[float], transform_fn: Callable[[float], Any]) -> List[Any]:
+    """Given a list of floats, remove all nan values, then apply the inputted transform function
+    to the remaining values, and return the resulting list of outputted values.
+
+    :param lst: A list of floats.
+    :type lst: List[float]
+    :param transform_fn: A function that produces something when applied to a float.
+    :type transform_fn: Callable[[float], Any]
+    :return: A list of the transformed values.
+    :rtype: List[Any]
+    """
+
+    msg = "All score values are NaN. The mean cannot be calculated."
+    if all(math.isnan(l) for l in lst):
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.CONVERSATION,
+        )
+    return [transform_fn(l) for l in lst if not is_none_or_nan(l)]
+
+
+def is_none_or_nan(val: float) -> bool:
+    """math.isnan raises an error if None is inputted. This is a more robust wrapper.
+
+    :param val: The value to check.
+    :type val: float
+    :return: Whether the value is None or NaN.
+    :rtype: bool
+    """
+
+    return val is None or math.isnan(val)
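A short usage sketch for the helpers added above. They live in a private `_common` module, so the import below simply mirrors the one `_evaluate.py` uses in this commit and is shown only for illustration:

```python
import math

# Private/internal module; import path taken from the _evaluate.py change in this commit.
from azure.ai.evaluation._common.math import (
    apply_transform_nan_safe,
    is_none_or_nan,
    list_mean_nan_safe,
)

scores = [1.0, 2.0, 3.0, math.nan, None]

print(is_none_or_nan(None))        # True -- math.isnan(None) would raise TypeError instead
print(list_mean_nan_safe(scores))  # 2.0 -- NaN/None entries are dropped before averaging

# The threshold of 2 here is arbitrary, purely for demonstration.
print(apply_transform_nan_safe(scores, lambda x: 1 if x >= 2 else 0))  # [0, 1, 1]
```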

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 25 additions & 14 deletions
@@ -3,6 +3,7 @@
 # ---------------------------------------------------------
 import inspect
 import json
+import logging
 import os
 import re
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
@@ -13,7 +14,7 @@
 from promptflow.client import PFClient
 from promptflow.entities import Run
 
-from azure.ai.evaluation._common.math import list_sum
+from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
@@ -35,6 +36,7 @@
 )
 
 TClient = TypeVar("TClient", ProxyClient, CodeClient)
+LOGGER = logging.getLogger(__name__)
 
 # For metrics (aggregates) whose metric names intentionally differ from their
 # originating column name, usually because the aggregation of the original value
@@ -69,10 +71,11 @@ def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, flo
             renamed_cols.append(col)
             new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
             col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
-            metric_columns[new_col_name] = round(
-                list_sum(col_with_numeric_values) / col_with_numeric_values.count(),
-                2,
-            )
+            try:
+                metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
+            except EvaluationException:  # only exception that can be cause is all NaN values
+                msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+                LOGGER.warning(msg)
 
     return renamed_cols, metric_columns
 
@@ -119,11 +122,15 @@ def _aggregate_content_safety_metrics(
     for col in content_safety_df.columns:
         defect_rate_name = col.replace("_score", "_defect_rate")
         col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
-        defect_rates[defect_rate_name] = round(
-            list_sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
-            / col_with_numeric_values.count(),
-            2,
-        )
+        try:
+            col_with_boolean_values = apply_transform_nan_safe(
+                col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
+            )
+            defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+        except EvaluationException:  # only exception that can be cause is all NaN values
+            msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+            LOGGER.warning(msg)
+
     return content_safety_cols, defect_rates
 
 
@@ -153,10 +160,11 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
     for col in label_df.columns:
         defect_rate_name = col.replace("_label", "_defect_rate")
         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
-        defect_rates[defect_rate_name] = round(
-            list_sum(col_with_boolean_values) / col_with_boolean_values.count(),
-            2,
-        )
+        try:
+            defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+        except EvaluationException:  # only exception that can be cause is all NaN values
+            msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+            LOGGER.warning(msg)
     return label_cols, defect_rates
 
 
@@ -193,6 +201,9 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     # For rest of metrics, we will calculate mean
     df.drop(columns=handled_columns, inplace=True)
 
+    # NOTE: nan/None values don't count as as booleans, so boolean columns with
+    # nan/None values won't have a mean produced from them.
+    # This is different from label-based known evaluators, which have special handling.
     mean_value = df.mean(numeric_only=True)
     metrics = mean_value.to_dict()
     # Add defect rates back into metrics
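To see the warn-and-skip behavior introduced above in isolation, here is a minimal, self-contained sketch of the per-column defect-rate logic. The threshold value 4 and the column name are assumptions standing in for `CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT` and real evaluator output; this is not the SDK function itself:

```python
import logging
import math
from typing import Optional

import pandas as pd

LOGGER = logging.getLogger(__name__)
DEFECT_THRESHOLD = 4  # assumed stand-in for CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT


def defect_rate_for(series: pd.Series) -> Optional[float]:
    """Mirror of the commit's per-column pattern: coerce to numeric, drop NaN/None,
    average a 0/1 defect indicator, and warn (instead of raising) when nothing is left."""
    numeric = pd.to_numeric(series, errors="coerce")       # None and unparseable values become NaN
    valid = [v for v in numeric if not math.isnan(v)]
    if not valid:
        LOGGER.warning(
            "All score evaluations are NaN/None for column %s. No aggregation can be performed.", series.name
        )
        return None
    return round(sum(1 if v >= DEFECT_THRESHOLD else 0 for v in valid) / len(valid), 2)


df = pd.DataFrame({"content_safety.violence_score": [0, 0, 1, 2, 5, 5, 6, 7, float("nan"), None]})
print(defect_rate_for(df["content_safety.violence_score"]))            # 0.5 -- 4 of the 8 usable scores are >= 4
print(defect_rate_for(pd.Series([None, float("nan")], name="empty")))  # None, after a logged warning
```

The production code delegates the filtering to `apply_transform_nan_safe` and `list_mean_nan_safe`; the sketch inlines that step so it can run standalone.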

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py

Lines changed: 1 addition & 1 deletion
@@ -403,7 +403,7 @@ def test_evaluate_with_groundedness_pro_evaluator(self, project_scope, data_conv
         assert "groundedness_pro_label" in per_turn_results.keys()
         assert "groundedness_pro_reason" in per_turn_results.keys()
 
-        # Check that label is renamed to passsing rate in metrics
+        # Check that label is renamed to passing rate in metrics
         assert "groundedness_pro.groundedness_pro_passing_rate" in convo_metrics.keys()
         assert 0 <= convo_metrics.get("groundedness_pro.groundedness_pro_passing_rate") <= 1

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py

Lines changed: 44 additions & 18 deletions
@@ -2,6 +2,7 @@
 import math
 import os
 import pathlib
+import numpy as np
 from unittest.mock import patch
 
 import pandas as pd
@@ -483,10 +484,10 @@ def test_get_trace_destination(self, mock_validate_trace_destination, mock_proje
     def test_content_safety_aggregation(self):
         data = {
             # 10 zeroes in a list fully written out
-            "content_safety.violence_score": [0, 0, 1, 2, 5, 5, 6, 7, 9, 6],
-            "content_safety.sexual_score": [0, 0, 2, 3, 3, 3, 8, 8, 8, 8],
-            "content_safety.self_harm_score": [0, 0, 0, 0, 1, 1, 1, 1, 7, 7],
-            "content_safety.hate_unfairness_score": [0, 0, 1, 1, 2, 2, 3, 3, 3, 6],
+            "content_safety.violence_score": [0, 0, 1, 2, 5, 5, 6, 7, np.nan, None],
+            "content_safety.sexual_score": [0, 0, 2, 3, 3, 3, 8, 8, np.nan, None],
+            "content_safety.self_harm_score": [0, 0, 0, 0, 1, 1, 1, 1, np.nan, None],
+            "content_safety.hate_unfairness_score": [0, 0, 1, 1, 2, 2, 3, 5, 6, 7],
             "content_safety.violence": [
                 "low",
                 "low",
@@ -514,18 +515,21 @@ def test_content_safety_aggregation(self):
         aggregation = _aggregate_metrics(data_df, evaluators)
 
         assert len(aggregation) == 4
-        assert aggregation["content_safety.violence_defect_rate"] == 0.6
-        assert aggregation["content_safety.sexual_defect_rate"] == 0.4
-        assert aggregation["content_safety.self_harm_defect_rate"] == 0.2
-        assert aggregation["content_safety.hate_unfairness_defect_rate"] == 0.1
+        assert aggregation["content_safety.violence_defect_rate"] == 0.5
+        assert aggregation["content_safety.sexual_defect_rate"] == 0.25
+        assert aggregation["content_safety.self_harm_defect_rate"] == 0.0
+        assert aggregation["content_safety.hate_unfairness_defect_rate"] == 0.3
+
+        no_results = _aggregate_metrics(pd.DataFrame({"content_safety.violence_score": [np.nan, None]}), evaluators)
+        assert len(no_results) == 0
 
     def test_label_based_aggregation(self):
         data = {
-            "eci.eci_label": [True, False, True, False, True],
+            "eci.eci_label": [True, True, True, np.nan, None],
             "eci.eci_reasoning": ["a", "b", "c", "d", "e"],
             "protected_material.protected_material_label": [False, False, False, False, True],
             "protected_material.protected_material_reasoning": ["f", "g", "h", "i", "j"],
-            "unknown.unaccounted_label": [True, False, False, False, True],
+            "unknown.unaccounted_label": [False, False, False, True, True],
             "unknown.unaccounted_reasoning": ["k", "l", "m", "n", "o"],
         }
         data_df = pd.DataFrame(data)
@@ -540,18 +544,37 @@ def test_label_based_aggregation(self):
         assert "protected_material.protected_material_label" not in aggregation
         assert aggregation["unknown.unaccounted_label"] == 0.4
 
-        assert aggregation["eci.eci_defect_rate"] == 0.6
+        assert aggregation["eci.eci_defect_rate"] == 1.0
         assert aggregation["protected_material.protected_material_defect_rate"] == 0.2
         assert "unaccounted_defect_rate" not in aggregation
 
+        no_results = _aggregate_metrics(pd.DataFrame({"eci.eci_label": [np.nan, None]}), evaluators)
+        assert len(no_results) == 0
+
+    def test_other_aggregation(self):
+        data = {
+            "thing.groundedness_pro_label": [True, False, True, False, np.nan, None],
+        }
+        data_df = pd.DataFrame(data)
+        evaluators = {}
+        aggregation = _aggregate_metrics(data_df, evaluators)
+
+        assert len(aggregation) == 1
+        assert aggregation["thing.groundedness_pro_passing_rate"] == 0.5
+
+        no_results = _aggregate_metrics(pd.DataFrame({"thing.groundedness_pro_label": [np.nan, None]}), {})
+        assert len(no_results) == 0
+
     def test_general_aggregation(self):
         data = {
-            "thing.metric": [1, 2, 3, 4, 5],
-            "thing.reasoning": ["a", "b", "c", "d", "e"],
-            "other_thing.other_meteric": [-1, -2, -3, -4, -5],
-            "other_thing.other_reasoning": ["f", "g", "h", "i", "j"],
-            "final_thing.final_metric": [False, False, False, True, True],
-            "bad_thing.mixed_metric": [0, 1, False, True, True],
+            "thing.metric": [1, 2, 3, 4, 5, np.nan, None],
+            "thing.reasoning": ["a", "b", "c", "d", "e", "f", "g"],
+            "other_thing.other_meteric": [-1, -2, -3, -4, -5, np.nan, None],
+            "other_thing.other_reasoning": ["f", "g", "h", "i", "j", "i", "j"],
+            "final_thing.final_metric": [False, False, False, True, True, True, False],
+            "bad_thing.mixed_metric": [0, 1, False, True, 0.5, True, False],
+            "bad_thing.boolean_with_nan": [True, False, True, False, True, False, np.nan],
+            "bad_thing.boolean_with_none": [True, False, True, False, True, False, None],
         }
         data_df = pd.DataFrame(data)
         evaluators = {}
@@ -560,7 +583,10 @@ def test_general_aggregation(self):
         assert len(aggregation) == 3
         assert aggregation["thing.metric"] == 3
         assert aggregation["other_thing.other_meteric"] == -3
-        assert aggregation["final_thing.final_metric"] == 0.4
+        assert aggregation["final_thing.final_metric"] == 3 / 7.0
+        assert "bad_thing.mixed_metric" not in aggregation
+        assert "bad_thing.boolean_with_nan" not in aggregation
+        assert "bad_thing.boolean_with_none" not in aggregation
 
     @pytest.mark.parametrize("use_pf_client", [True, False])
     def test_optional_inputs_with_data(self, questions_file, questions_answers_basic_file, use_pf_client):
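The `bad_thing.boolean_with_nan` / `bad_thing.boolean_with_none` assertions above follow directly from pandas dtype rules, which the commit's added NOTE comment relies on. A minimal sketch (not part of the test file) illustrating why such columns drop out of the general aggregation:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "final_thing.final_metric": [False, False, False, True, True, True, False],
    "bad_thing.boolean_with_nan": [True, False, True, False, True, False, np.nan],
})

print(df.dtypes)
# final_thing.final_metric      bool    -> counted as numeric, mean is 3/7
# bad_thing.boolean_with_nan    object  -> the NaN forces object dtype

print(df.mean(numeric_only=True))
# Only final_thing.final_metric appears: the object column is silently skipped,
# which is why the test asserts it is absent from the aggregation.
```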
