Commit f805a5b

Fix result converter bug: handle null value for evaluation summary calculation (#43753)
* update
* rename
* run black
* fix result counts
* update
* Fix bug
* run black
* fix bug
1 parent 0320376 commit f805a5b

File tree

1 file changed: +63 -19 lines changed
  • sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 63 additions & 19 deletions
@@ -2137,20 +2137,38 @@ def _convert_results_to_aoai_evaluation_results(
             # Create result object for this criteria
             metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
             for metric in metrics:
-                result_obj = {
-                    "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
-                        "type", "azure_ai_evaluator"
-                    ),
-                    "name": criteria_name,  # Use criteria name as name
-                    "metric": metric if metric is not None else criteria_name,  # Use criteria name as metric
-                    "score": None,
-                    "label": None,
-                    "reason": None,
-                    "threshold": None,
-                    "passed": None,
-                    "sample": sample,
-                }
-                run_output_results.append(result_obj)
+                should_add_error_summary = True
+                for result in run_output_results:
+                    if result.get("name", None) == criteria_name and result.get("metric", None) == metric:
+                        rs_score = result.get("score", None)
+                        rs_threshold = result.get("threshold", None)
+                        rs_label = result.get("label", None)
+                        rs_reason = result.get("reason", None)
+                        if (
+                            _is_none_or_nan(rs_score)
+                            and _is_none_or_nan(rs_threshold)
+                            and _is_none_or_nan(rs_label)
+                            and _is_none_or_nan(rs_reason)
+                        ):
+                            run_output_results.remove(result)
+                        else:
+                            should_add_error_summary = False
+                        break  # Skip if already have result for this criteria and metric
+                if should_add_error_summary:
+                    result_obj = {
+                        "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
+                            "type", "azure_ai_evaluator"
+                        ),
+                        "name": criteria_name,  # Use criteria name as name
+                        "metric": metric if metric is not None else criteria_name,  # Use criteria name as metric
+                        "score": None,
+                        "label": None,
+                        "reason": None,
+                        "threshold": None,
+                        "passed": None,
+                        "sample": sample,
+                    }
+                    run_output_results.append(result_obj)
 
             # Create RunOutputItem structure
             run_output_item = {
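
To make the intent of the new block easier to follow, here is a minimal standalone sketch of the same rule, not the SDK code itself: a placeholder row for a (criteria, metric) pair is appended only when no usable result exists yet, and an all-empty duplicate is dropped first. The `ensure_placeholder` helper, the `_is_empty` stand-in for the diff's `_is_none_or_nan` (added further down in this commit), and the sample dicts are hypothetical.

```python
import math
from typing import Any, Dict, List


def _is_empty(value: Any) -> bool:
    # Mirrors the diff's _is_none_or_nan: None, float NaN, or "nan"/"null"/"none" strings count as empty.
    if value is None:
        return True
    if isinstance(value, float) and math.isnan(value):
        return True
    return isinstance(value, str) and value.lower() in ["nan", "null", "none"]


def ensure_placeholder(results: List[Dict[str, Any]], name: str, metric: str) -> None:
    # Hypothetical helper: append an all-None placeholder only if no usable result
    # for (name, metric) exists yet; an all-empty duplicate is removed and replaced.
    for result in results:
        if result.get("name") == name and result.get("metric") == metric:
            if all(_is_empty(result.get(key)) for key in ("score", "threshold", "label", "reason")):
                results.remove(result)  # empty duplicate: fall through and append the placeholder
                break
            return  # a usable result already exists; do not add a placeholder
    results.append(
        {"name": name, "metric": metric, "score": None, "label": None,
         "reason": None, "threshold": None, "passed": None}
    )


existing = [{"name": "relevance", "metric": "relevance", "score": 4.0, "threshold": 3.0,
             "label": "pass", "reason": "grounded answer", "passed": True}]
ensure_placeholder(existing, "relevance", "relevance")
print(len(existing))  # 1 -- the usable result is kept and no placeholder is added
```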
@@ -2182,6 +2200,24 @@ def _convert_results_to_aoai_evaluation_results(
     )
 
 
+def _is_none_or_nan(value: Any) -> bool:
+    """
+    Check if a value is None or NaN.
+
+    :param value: The value to check
+    :type value: Any
+    :return: True if the value is None or NaN, False otherwise
+    :rtype: bool
+    """
+    if value is None:
+        return True
+    if isinstance(value, float) and math.isnan(value):
+        return True
+    if isinstance(value, str) and value.lower() in ["nan", "null", "none"]:
+        return True
+    return False
+
+
 def _append_indirect_attachments_to_results(
     current_result_dict: Dict[str, Any],
     result_name: str,
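
A quick sanity check of the helper added above; the function body is copied from the diff, while the assertions below are illustrative only.

```python
import math
from typing import Any


def _is_none_or_nan(value: Any) -> bool:
    # Copied from the diff above: treats None, float NaN, and the strings
    # "nan"/"null"/"none" (case-insensitive) as empty.
    if value is None:
        return True
    if isinstance(value, float) and math.isnan(value):
        return True
    if isinstance(value, str) and value.lower() in ["nan", "null", "none"]:
        return True
    return False


assert _is_none_or_nan(None)
assert _is_none_or_nan(float("nan"))
assert _is_none_or_nan("NaN") and _is_none_or_nan("null")
assert not _is_none_or_nan(0)       # zero is a real score, not "empty"
assert not _is_none_or_nan("pass")  # labels with content are kept
```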
@@ -2363,7 +2399,7 @@ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logger
         for sample_data in sample_data_list:
             if sample_data and isinstance(sample_data, dict) and "usage" in sample_data:
                 usage_data = sample_data["usage"]
-                model_name = sample_data.get("model", "unknown")
+                model_name = sample_data.get("model", "unknown") if usage_data.get("model", "unknown") else "unknown"
                 if model_name not in model_usage_stats:
                     model_usage_stats[model_name] = {
                         "invocation_count": 0,
@@ -2376,10 +2412,18 @@ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logger
                 model_stats = model_usage_stats[model_name]
                 model_stats["invocation_count"] += 1
                 if isinstance(usage_data, dict):
-                    model_stats["total_tokens"] += usage_data.get("total_tokens", 0)
-                    model_stats["prompt_tokens"] += usage_data.get("prompt_tokens", 0)
-                    model_stats["completion_tokens"] += usage_data.get("completion_tokens", 0)
-                    model_stats["cached_tokens"] += usage_data.get("cached_tokens", 0)
+                    model_stats["total_tokens"] += (
+                        usage_data.get("total_tokens", 0) if usage_data.get("total_tokens", 0) else 0
+                    )
+                    model_stats["prompt_tokens"] += (
+                        usage_data.get("prompt_tokens", 0) if usage_data.get("prompt_tokens", 0) else 0
+                    )
+                    model_stats["completion_tokens"] += (
+                        usage_data.get("completion_tokens", 0) if usage_data.get("completion_tokens", 0) else 0
+                    )
+                    model_stats["cached_tokens"] += (
+                        usage_data.get("cached_tokens", 0) if usage_data.get("cached_tokens", 0) else 0
+                    )
 
         # Convert model usage stats to list format matching EvaluationRunPerModelUsage
         per_model_usage = []
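
For context on why the plain `usage_data.get("total_tokens", 0)` calls were not enough: `dict.get` only falls back to its default when the key is missing, not when the stored value is an explicit None, so a usage payload such as `{"total_tokens": None}` would previously have made the `+=` raise a TypeError. A small illustrative snippet (the payload below is made up):

```python
usage_data = {"total_tokens": None, "prompt_tokens": 150}  # hypothetical payload with an explicit null

# dict.get returns the stored None, not the default, when the key exists:
assert usage_data.get("total_tokens", 0) is None

total = 0
# The pattern from the diff: fall back to 0 for None, 0, and missing values alike.
total += usage_data.get("total_tokens", 0) if usage_data.get("total_tokens", 0) else 0
total += usage_data.get("prompt_tokens", 0) if usage_data.get("prompt_tokens", 0) else 0
print(total)  # 150 -- no TypeError from adding None
```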

0 commit comments