@@ -208,7 +208,7 @@ def _get_single_run_results(
208208 if run_results .status != "completed" :
209209 raise EvaluationException (
210210 message = f"AOAI evaluation run { run_info ['eval_group_id' ]} /{ run_info ['eval_run_id' ]} "
211- + " failed with status {run_results.status}." ,
211+ + f " failed with status { run_results .status } ." ,
212212 blame = ErrorBlame .UNKNOWN ,
213213 category = ErrorCategory .FAILED_EXECUTION ,
214214 target = ErrorTarget .AOAI_GRADER ,
@@ -240,8 +240,12 @@ def _get_single_run_results(
240240 eval_id = run_info ["eval_group_id" ],
241241 run_id = run_info ["eval_run_id" ]
242242 )
243- listed_results = {}
243+ listed_results = {"index" : []}
244+ # raw data has no order guarantees, we need to sort them by their
245+ # datasource_item_id
244246 for row_result in raw_list_results .data :
247+ # Add the datasource_item_id for later sorting
248+ listed_results ["index" ].append (row_result .datasource_item_id )
245249 for single_grader_row_result in row_result .results :
246250 grader_name = run_info ["grader_name_map" ][single_grader_row_result ["name" ]]
247251 for name , value in single_grader_row_result .items ():
@@ -251,14 +255,19 @@ def _get_single_run_results(
251255 # create a `_result` column for each grader
252256 result_column_name = f"outputs.{ grader_name } .{ grader_name } _result"
253257 if len (result_column_name ) < 50 : #TODO: is this the limit? Should we keep "passed"?
254- listed_results [result_column_name ] = EVALUATION_PASS_FAIL_MAPPING [value ]
258+ if (result_column_name not in listed_results ):
259+ listed_results [result_column_name ] = []
260+ listed_results [result_column_name ].append (EVALUATION_PASS_FAIL_MAPPING [value ])
255261
256262 formatted_column_name = f"outputs.{ grader_name } .{ name } "
257263 if (formatted_column_name not in listed_results ):
258264 listed_results [formatted_column_name ] = []
259- listed_results [f"outputs. { grader_name } . { name } " ].append (value )
265+ listed_results [formatted_column_name ].append (value )
260266 output_df = pd .DataFrame (listed_results )
261-
267+ # sort by index
268+ output_df = output_df .sort_values ('index' , ascending = [True ])
269+ # remove index column
270+ output_df .drop (columns = ["index" ], inplace = True )
262271 return output_df , run_metrics
263272
264273
0 commit comments