@@ -2137,20 +2137,38 @@ def _convert_results_to_aoai_evaluation_results(
             # Create result object for this criteria
             metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
             for metric in metrics:
-                result_obj = {
-                    "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
-                        "type", "azure_ai_evaluator"
-                    ),
-                    "name": criteria_name,  # Use criteria name as name
-                    "metric": metric if metric is not None else criteria_name,  # Use criteria name as metric
-                    "score": None,
-                    "label": None,
-                    "reason": None,
-                    "threshold": None,
-                    "passed": None,
-                    "sample": sample,
-                }
-                run_output_results.append(result_obj)
+                should_add_error_summary = True
+                for result in run_output_results:
+                    if result.get("name", None) == criteria_name and result.get("metric", None) == metric:
+                        rs_score = result.get("score", None)
+                        rs_threshold = result.get("threshold", None)
+                        rs_label = result.get("label", None)
+                        rs_reason = result.get("reason", None)
+                        if (
+                            _is_none_or_nan(rs_score)
+                            and _is_none_or_nan(rs_threshold)
+                            and _is_none_or_nan(rs_label)
+                            and _is_none_or_nan(rs_reason)
+                        ):
+                            run_output_results.remove(result)
+                        else:
+                            should_add_error_summary = False
+                        break  # Skip if already have result for this criteria and metric
+                if should_add_error_summary:
+                    result_obj = {
+                        "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
+                            "type", "azure_ai_evaluator"
+                        ),
+                        "name": criteria_name,  # Use criteria name as name
+                        "metric": metric if metric is not None else criteria_name,  # Use criteria name as metric
+                        "score": None,
+                        "label": None,
+                        "reason": None,
+                        "threshold": None,
+                        "passed": None,
+                        "sample": sample,
+                    }
+                    run_output_results.append(result_obj)
 
         # Create RunOutputItem structure
         run_output_item = {
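The net effect of this hunk: the all-`None` error placeholder is only appended when no usable result already exists for the criteria/metric pair, and an existing row whose score, threshold, label, and reason are all empty is dropped first so a fresh placeholder replaces it. A minimal standalone sketch of that pruning logic, with a stubbed `_is_none_or_nan` and hypothetical result rows:

```python
import math


def _is_none_or_nan(value):
    """Stub of the helper introduced later in this diff."""
    if value is None:
        return True
    if isinstance(value, float) and math.isnan(value):
        return True
    return isinstance(value, str) and value.lower() in ["nan", "null", "none"]


run_output_results = [
    # A usable result: kept as-is, no placeholder appended for it.
    {"name": "relevance", "metric": "relevance", "score": 4.0,
     "threshold": 3.0, "label": "pass", "reason": "on topic"},
    # A stale all-empty row: removed, then replaced by a fresh placeholder.
    {"name": "coherence", "metric": "coherence", "score": float("nan"),
     "threshold": None, "label": None, "reason": None},
]

for criteria_name, metric in [("relevance", "relevance"), ("coherence", "coherence")]:
    should_add_error_summary = True
    for result in run_output_results:
        if result.get("name") == criteria_name and result.get("metric") == metric:
            if all(_is_none_or_nan(result.get(k)) for k in ("score", "threshold", "label", "reason")):
                run_output_results.remove(result)  # stale placeholder: replace it
            else:
                should_add_error_summary = False  # real result: keep it, add nothing
            break  # only the first matching row is considered, as in the diff
    if should_add_error_summary:
        run_output_results.append({"name": criteria_name, "metric": metric, "score": None,
                                   "label": None, "reason": None, "threshold": None, "passed": None})

# "relevance" keeps its scored row; "coherence" ends up with exactly one placeholder.
assert len(run_output_results) == 2
```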
@@ -2182,6 +2200,24 @@ def _convert_results_to_aoai_evaluation_results(
     )
 
 
+def _is_none_or_nan(value: Any) -> bool:
+    """
+    Check if a value is None or NaN.
+
+    :param value: The value to check
+    :type value: Any
+    :return: True if the value is None or NaN, False otherwise
+    :rtype: bool
+    """
+    if value is None:
+        return True
+    if isinstance(value, float) and math.isnan(value):
+        return True
+    if isinstance(value, str) and value.lower() in ["nan", "null", "none"]:
+        return True
+    return False
+
+
 def _append_indirect_attachments_to_results(
     current_result_dict: Dict[str, Any],
     result_name: str,
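For reference, the helper's contract: literal `None`, a float `NaN`, and the strings `"nan"`/`"null"`/`"none"` in any casing all count as empty, while falsy-but-real values such as `0` or `""` do not (the body also assumes `math` is already imported at module level). A quick check of the edge cases, restating the helper so the snippet runs on its own:

```python
import math


def _is_none_or_nan(value):
    """Copy of the helper above so this snippet is self-contained."""
    if value is None:
        return True
    if isinstance(value, float) and math.isnan(value):
        return True
    return isinstance(value, str) and value.lower() in ["nan", "null", "none"]


assert _is_none_or_nan(None)
assert _is_none_or_nan(float("nan")) and _is_none_or_nan(math.nan)
assert _is_none_or_nan("NaN") and _is_none_or_nan("NULL") and _is_none_or_nan("None")
assert not _is_none_or_nan(0)      # a real score of zero is not "empty"
assert not _is_none_or_nan("")     # empty string is deliberately not matched
assert not _is_none_or_nan(3.5)
```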
@@ -2363,7 +2399,7 @@ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logger
         for sample_data in sample_data_list:
             if sample_data and isinstance(sample_data, dict) and "usage" in sample_data:
                 usage_data = sample_data["usage"]
-                model_name = sample_data.get("model", "unknown")
+                model_name = sample_data.get("model", "unknown") if sample_data.get("model", "unknown") else "unknown"
                 if model_name not in model_usage_stats:
                     model_usage_stats[model_name] = {
                         "invocation_count": 0,
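The `if ... else` guard on `model_name` (and on the token counters in the next hunk) exists because `dict.get`'s default only applies when the key is absent; a key explicitly present with a `None` value slips through it. A minimal illustration of the pitfall and the guarded form:

```python
sample_data = {"model": None, "usage": {"total_tokens": None}}

# get()'s default does NOT apply when the key is present but None:
assert sample_data.get("model", "unknown") is None

# The guarded form used in this diff falls back correctly:
model_name = sample_data.get("model", "unknown") if sample_data.get("model", "unknown") else "unknown"
assert model_name == "unknown"
```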
@@ -2376,10 +2412,18 @@ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logger
                 model_stats = model_usage_stats[model_name]
                 model_stats["invocation_count"] += 1
                 if isinstance(usage_data, dict):
-                    model_stats["total_tokens"] += usage_data.get("total_tokens", 0)
-                    model_stats["prompt_tokens"] += usage_data.get("prompt_tokens", 0)
-                    model_stats["completion_tokens"] += usage_data.get("completion_tokens", 0)
-                    model_stats["cached_tokens"] += usage_data.get("cached_tokens", 0)
+                    model_stats["total_tokens"] += (
+                        usage_data.get("total_tokens", 0) if usage_data.get("total_tokens", 0) else 0
+                    )
+                    model_stats["prompt_tokens"] += (
+                        usage_data.get("prompt_tokens", 0) if usage_data.get("prompt_tokens", 0) else 0
+                    )
+                    model_stats["completion_tokens"] += (
+                        usage_data.get("completion_tokens", 0) if usage_data.get("completion_tokens", 0) else 0
+                    )
+                    model_stats["cached_tokens"] += (
+                        usage_data.get("cached_tokens", 0) if usage_data.get("cached_tokens", 0) else 0
+                    )
 
         # Convert model usage stats to list format matching EvaluationRunPerModelUsage
         per_model_usage = []
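The same guard on the token counters prevents a `TypeError` from `int += None` when a usage payload reports explicit nulls, e.g. for a failed model call. A condensed sketch of the accumulation over hypothetical usage dicts; the `or 0` shorthand here is equivalent to the diff's `x if x else 0` form:

```python
model_stats = {"total_tokens": 0, "prompt_tokens": 0, "completion_tokens": 0, "cached_tokens": 0}

usage_payloads = [
    {"total_tokens": 120, "prompt_tokens": 100, "completion_tokens": 20},  # cached_tokens absent
    {"total_tokens": None, "prompt_tokens": None},  # explicit nulls from a failed call
]

for usage_data in usage_payloads:
    for key in model_stats:
        # get() covers a missing key; `or 0` covers a key that is present but None.
        model_stats[key] += usage_data.get(key, 0) or 0

assert model_stats == {"total_tokens": 120, "prompt_tokens": 100,
                       "completion_tokens": 20, "cached_tokens": 0}
```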