@@ -80,8 +80,7 @@ async def create_evaluation(
 
         if not evaluation_name:
             evaluation_name = f"eval_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}_{flow_id}_{dataset_name}"
-
-        logger.info(f"Creating evaluation: {evaluation_name}")
+            logger.info(f"Using generated evaluation name: {evaluation_name}")
 
         if not instance_ids:
             instance_ids = self.get_dataset_instance_ids(dataset_name)
@@ -179,6 +178,80 @@ async def clone_evaluation(self, evaluation_name: str) -> Evaluation:
             instance_ids=instance_ids,
         )
 
+    async def add_instances_to_evaluation(self, evaluation_name: str, instance_ids: list[str]) -> Evaluation:
+        """Add new instances to an existing evaluation."""
+        evaluation = await self._load_evaluation(evaluation_name)
+        if not evaluation:
+            raise ValueError(f"Evaluation {evaluation_name} not found")
+
+        # Find existing instance IDs
+        existing_instance_ids = {instance.instance_id for instance in evaluation.instances}
+
+        # Filter out instances that already exist
+        new_instance_ids = [instance_id for instance_id in instance_ids if instance_id not in existing_instance_ids]
+
+        if not new_instance_ids:
+            logger.info(f"All provided instances already exist in evaluation {evaluation_name}")
+            return evaluation
+
+        logger.info(f"Adding {len(new_instance_ids)} new instances to evaluation {evaluation_name}")
+
+        # Create new EvaluationInstance objects
+        new_instances = [EvaluationInstance(instance_id=instance_id) for instance_id in new_instance_ids]
+
+        # Add them to the evaluation
+        evaluation.instances.extend(new_instances)
+
+        # Create trajectories for the new instances
+        for instance in new_instances:
+            await self._create_trajectory(evaluation, instance)
+
+        # Save the updated evaluation
+        await self._save_evaluation(evaluation)
+
+        logger.info(f"Successfully added {len(new_instances)} instances to evaluation {evaluation_name}")
+        return evaluation
+
+    async def add_dataset_to_evaluation(self, evaluation_name: str, dataset_name: str) -> Evaluation:
+        """Add all instances from a dataset to an existing evaluation."""
+        evaluation = await self._load_evaluation(evaluation_name)
+        if not evaluation:
+            raise ValueError(f"Evaluation {evaluation_name} not found")
+
+        try:
+            # Get all instance IDs from the dataset
+            dataset_instance_ids = self.get_dataset_instance_ids(dataset_name)
+        except ValueError as e:
+            raise ValueError(f"Dataset {dataset_name} not found: {e}") from e
+
+        # Find existing instance IDs
+        existing_instance_ids = {instance.instance_id for instance in evaluation.instances}
+
+        # Filter out instances that already exist
+        new_instance_ids = [instance_id for instance_id in dataset_instance_ids if instance_id not in existing_instance_ids]
+
+        if not new_instance_ids:
+            logger.info(f"All instances from dataset {dataset_name} already exist in evaluation {evaluation_name}")
+            return evaluation
+
+        logger.info(f"Adding {len(new_instance_ids)} instances from dataset {dataset_name} to evaluation {evaluation_name}")
+
+        # Create new EvaluationInstance objects
+        new_instances = [EvaluationInstance(instance_id=instance_id) for instance_id in new_instance_ids]
+
+        # Add them to the evaluation
+        evaluation.instances.extend(new_instances)
+
+        # Create trajectories for the new instances
+        for instance in new_instances:
+            await self._create_trajectory(evaluation, instance)
+
+        # Save the updated evaluation
+        await self._save_evaluation(evaluation)
+
+        logger.info(f"Successfully added {len(new_instances)} instances from dataset {dataset_name} to evaluation {evaluation_name}")
+        return evaluation
+
     @tracer.start_as_current_span("EvaluationManager.start_evaluation")
     async def start_evaluation(self, evaluation_name: str) -> Evaluation:
         """Start an evaluation by running all instances."""
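
A minimal usage sketch for the two methods added above. Only the method signatures come from this diff; the `manager` object, the evaluation name, the instance IDs, and the dataset name are hypothetical stand-ins:

    # Hypothetical driver code; `manager` is an already-constructed
    # EvaluationManager and the names below are placeholders.
    evaluation = await manager.add_instances_to_evaluation(
        "my_eval",
        ["django__django-11099", "sympy__sympy-13480"],
    )
    evaluation = await manager.add_dataset_to_evaluation("my_eval", "my_dataset")

Both methods are duplicate-safe: IDs already present in evaluation.instances are filtered out before new EvaluationInstance objects and their trajectories are created, so repeating a call is a no-op for instances that already exist.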
@@ -396,7 +469,8 @@ async def _process_trajectory_results(
             project_id=evaluation.evaluation_name,
             trajectory_id=instance.instance_id,
         ):
-            logger.warning(f"Instance {instance.instance_id} not found in storage {self.storage}")
+            logger.warning(f"Instance {instance.instance_id} not found in storage {self.storage}; creating it")
+            await self._create_trajectory(evaluation, instance)
             return instance
 
         try:
@@ -416,24 +490,31 @@ async def _process_trajectory_results(
 
         if node.reward:
             instance.reward = node.reward.value
+
+        if len(node.get_all_nodes()) > 1:
+            # TODO: The second created node is the best indication of the start time...
+            instance.started_at = node.get_all_nodes()[1].timestamp
+            instance.completed_at = flow.root.get_all_nodes()[-1].timestamp
 
         logger.debug(f"Instance {instance.instance_id} flow is finished: {flow.is_finished()}")
         if flow.is_finished():
             instance.execution_status = ExecutionStatus.COMPLETED
         else:
             instance.execution_status = ExecutionStatus.CREATED
 
-        if node.evaluation_result and node.evaluation_result.resolved is not None:
-            # Set resolution status based on evaluation
-            instance.set_resolution(node.evaluation_result.resolved)
-            logger.info(f"Instance {instance.instance_id} resolution: {instance.resolution_status}")
+        if node.evaluation_result:
+            if node.evaluation_result.resolved is not None:
+                # Set resolution status based on evaluation
+                instance.set_resolution(node.evaluation_result.resolved)
+                logger.info(f"Instance {instance.instance_id} resolution: {instance.resolution_status}")
 
-
-        if node.evaluation_result and node.evaluation_result.details:
-            p2p_failues = node.evaluation_result.details.get("tests_status", {}).get("PASS_TO_PASS", {}).get("failure", [])
-            logger.info(f"Instance {instance.instance_id} p2p failures: {p2p_failues}")
-            if p2p_failues and not "p2p_failures" in instance.issues:
-                instance.issues.append("p2p_failures")
+            if node.evaluation_result.details:
+                p2p_failures = node.evaluation_result.details.get("tests_status", {}).get("PASS_TO_PASS", {}).get("failure", [])
+                if p2p_failures and "p2p_failures" not in instance.issues:
+                    instance.issues.append("p2p_failures")
+
+            logger.info(f"Instance {instance.instance_id} evaluated at: {node.evaluation_result.end_time}")
+            instance.evaluated_at = node.evaluation_result.end_time
 
         if instance.resolution_status != ResolutionStatus.RESOLVED:
             logger.debug(f"Instance {instance.instance_id} and node {node.node_id} has no evaluation result")
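
The started_at/completed_at derivation above leans on node creation order: the root node exists from trajectory setup, so the second node's timestamp is the closest available proxy for when work actually began, while the last node in the tree marks completion. A self-contained sketch of the same idea, using a stand-in Node type rather than the project's class:

    from dataclasses import dataclass
    from datetime import datetime

    @dataclass
    class Node:
        # Stand-in with only the field this logic needs.
        timestamp: datetime

    def infer_span(nodes: list[Node]) -> tuple[datetime, datetime] | None:
        """Return (started_at, completed_at) from a creation-ordered node list."""
        if len(nodes) < 2:
            return None  # only the root exists, so no work has started yet
        return nodes[1].timestamp, nodes[-1].timestamp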
@@ -729,14 +810,18 @@ async def start_instance(self, evaluation_name: str, instance_id: str) -> Evalua
 
         flow = await self._flow_manager.get_flow(evaluation.evaluation_name, instance.instance_id)
         should_save = False
+        evaluated = False
         for leaf_node in flow.root.get_leaf_nodes():
             if leaf_node.error:
                 logger.info(f"Resetting node {leaf_node.node_id} with error")
                 leaf_node.reset()
                 should_save = True
+
+            if leaf_node.evaluation_result:
+                evaluated = True
 
         # Skip if already completed and evaluated
-        if instance.is_finished() and instance.is_evaluated():
+        if instance.is_finished() and evaluated:
             finish_reason = flow.is_finished()
             if finish_reason:
                 logger.info(f"Instance {instance_id} is already completed and evaluated, skipping: {finish_reason}")
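
Note on the check above: the old instance.is_evaluated() call trusted a flag on the instance record, which can drift from what is actually stored in the flow tree; deriving `evaluated` from the leaf nodes makes the flow the source of truth. The loop's effect on the flag is equivalent to this one-liner, assuming only the get_leaf_nodes() and evaluation_result attributes shown in the diff:

    evaluated = any(leaf.evaluation_result for leaf in flow.root.get_leaf_nodes())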
@@ -881,10 +966,11 @@ async def get_evaluation_stats(self, evaluation_name: str) -> EvaluationStats:
         if not evaluation:
             raise ValueError(f"Evaluation {evaluation_name} not found")
 
+        instances = [i for i in evaluation.instances if i.is_finished()]
         # Calculate basic metrics
-        total_instances = len(evaluation.instances)
-        resolved_instances = sum(1 for instance in evaluation.instances if instance.resolved is True)
-        failed_instances = sum(1 for instance in evaluation.instances if instance.resolved is False)
+        total_instances = len(instances)
+        resolved_instances = sum(1 for instance in instances if instance.resolved is True)
+        failed_instances = sum(1 for instance in instances if instance.resolved is False)
         success_rate = (resolved_instances / total_instances) * 100 if total_instances > 0 else 0
 
         # Calculate cost and token metrics
@@ -896,7 +982,7 @@ async def get_evaluation_stats(self, evaluation_name: str) -> EvaluationStats:
         iteration_values = []
         cost_values = []
 
-        for instance in evaluation.instances:
+        for instance in instances:
             if instance.usage:
                 cost = instance.usage.completion_cost or 0
                 total_cost += cost
@@ -958,7 +1044,7 @@ async def get_evaluation_stats(self, evaluation_name: str) -> EvaluationStats:
 
         # Per-repo statistics
         repo_data = {}
-        for instance in evaluation.instances:
+        for instance in instances:
             try:
                 swebench_instance = get_swebench_instance(instance.instance_id)
                 repo = swebench_instance["repo"]
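
Restricting the stats to finished instances changes the denominator of success_rate. With 10 instances, 6 of them finished and 4 of those resolved, the unfiltered version reported 4/10 = 40%, while the filtered version reports 4/6, roughly 66.7%. A worked check of the formula as written, with illustrative values only:

    total_instances = 6  # finished instances only
    resolved_instances = 4
    success_rate = (resolved_instances / total_instances) * 100 if total_instances > 0 else 0
    assert round(success_rate, 1) == 66.7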