@@ -80,8 +80,7 @@ async def create_evaluation(
 
         if not evaluation_name:
             evaluation_name = f"eval_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}_{flow_id}_{dataset_name}"
-
-        logger.info(f"Creating evaluation: {evaluation_name}")
+            logger.info(f"Using generated evaluation name: {evaluation_name}")
 
         if not instance_ids:
             instance_ids = self.get_dataset_instance_ids(dataset_name)
@@ -179,6 +178,80 @@ async def clone_evaluation(self, evaluation_name: str) -> Evaluation:
             instance_ids=instance_ids,
         )
 
+    async def add_instances_to_evaluation(self, evaluation_name: str, instance_ids: list[str]) -> Evaluation:
+        """Add new instances to an existing evaluation."""
+        evaluation = await self._load_evaluation(evaluation_name)
+        if not evaluation:
+            raise ValueError(f"Evaluation {evaluation_name} not found")
+
+        # Find existing instance IDs
+        existing_instance_ids = {instance.instance_id for instance in evaluation.instances}
+
+        # Filter out instances that already exist
+        new_instance_ids = [instance_id for instance_id in instance_ids if instance_id not in existing_instance_ids]
+
+        if not new_instance_ids:
+            logger.info(f"All provided instances already exist in evaluation {evaluation_name}")
+            return evaluation
+
+        logger.info(f"Adding {len(new_instance_ids)} new instances to evaluation {evaluation_name}")
+
+        # Create new EvaluationInstance objects
+        new_instances = [EvaluationInstance(instance_id=instance_id) for instance_id in new_instance_ids]
+
+        # Add them to the evaluation
+        evaluation.instances.extend(new_instances)
+
+        # Create trajectories for the new instances
+        for instance in new_instances:
+            await self._create_trajectory(evaluation, instance)
+
+        # Save the updated evaluation
+        await self._save_evaluation(evaluation)
+
+        logger.info(f"Successfully added {len(new_instances)} instances to evaluation {evaluation_name}")
+        return evaluation
+
+    async def add_dataset_to_evaluation(self, evaluation_name: str, dataset_name: str) -> Evaluation:
+        """Add all instances from a dataset to an existing evaluation."""
+        evaluation = await self._load_evaluation(evaluation_name)
+        if not evaluation:
+            raise ValueError(f"Evaluation {evaluation_name} not found")
+
+        try:
+            # Get all instance IDs from the dataset
+            dataset_instance_ids = self.get_dataset_instance_ids(dataset_name)
+        except ValueError as e:
+            raise ValueError(f"Dataset {dataset_name} not found: {e}") from e
+
+        # Find existing instance IDs
+        existing_instance_ids = {instance.instance_id for instance in evaluation.instances}
+
+        # Filter out instances that already exist
+        new_instance_ids = [instance_id for instance_id in dataset_instance_ids if instance_id not in existing_instance_ids]
+
+        if not new_instance_ids:
+            logger.info(f"All instances from dataset {dataset_name} already exist in evaluation {evaluation_name}")
+            return evaluation
+
+        logger.info(f"Adding {len(new_instance_ids)} instances from dataset {dataset_name} to evaluation {evaluation_name}")
+
+        # Create new EvaluationInstance objects
+        new_instances = [EvaluationInstance(instance_id=instance_id) for instance_id in new_instance_ids]
+
+        # Add them to the evaluation
+        evaluation.instances.extend(new_instances)
+
+        # Create trajectories for the new instances
+        for instance in new_instances:
+            await self._create_trajectory(evaluation, instance)
+
+        # Save the updated evaluation
+        await self._save_evaluation(evaluation)
+
+        logger.info(f"Successfully added {len(new_instances)} instances from dataset {dataset_name} to evaluation {evaluation_name}")
+        return evaluation
+
     @tracer.start_as_current_span("EvaluationManager.start_evaluation")
     async def start_evaluation(self, evaluation_name: str) -> Evaluation:
         """Start an evaluation by running all instances."""
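
A minimal usage sketch for the two methods added above. Only the method signatures come from this diff; the `manager` object, the evaluation name, the instance IDs, and the dataset name are hypothetical stand-ins:

    # Hypothetical driver code; `manager` is an already-constructed
    # EvaluationManager and the names below are placeholders.
    evaluation = await manager.add_instances_to_evaluation(
        "my_eval",
        ["django__django-11099", "sympy__sympy-13480"],
    )
    evaluation = await manager.add_dataset_to_evaluation("my_eval", "my_dataset")

Both methods are duplicate-safe: IDs already present in evaluation.instances are filtered out before new EvaluationInstance objects and their trajectories are created, so repeating a call is a no-op for instances that already exist.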
@@ -396,7 +469,8 @@ async def _process_trajectory_results(
             project_id=evaluation.evaluation_name,
             trajectory_id=instance.instance_id,
         ):
-            logger.warning(f"Instance {instance.instance_id} not found in storage {self.storage}")
+            logger.warning(f"Instance {instance.instance_id} not found in storage {self.storage}; creating it")
+            await self._create_trajectory(evaluation, instance)
             return instance
 
         try:
@@ -416,24 +490,31 @@ async def _process_trajectory_results(
 
         if node.reward:
             instance.reward = node.reward.value
+
+        if len(node.get_all_nodes()) > 1:
+            # TODO: The second created node is the best indication of the start time...
+            instance.started_at = node.get_all_nodes()[1].timestamp
+            instance.completed_at = flow.root.get_all_nodes()[-1].timestamp
 
         logger.debug(f"Instance {instance.instance_id} flow is finished: {flow.is_finished()}")
         if flow.is_finished():
             instance.execution_status = ExecutionStatus.COMPLETED
         else:
             instance.execution_status = ExecutionStatus.CREATED
 
-        if node.evaluation_result and node.evaluation_result.resolved is not None:
-            # Set resolution status based on evaluation
-            instance.set_resolution(node.evaluation_result.resolved)
-            logger.info(f"Instance {instance.instance_id} resolution: {instance.resolution_status}")
+        if node.evaluation_result:
+            if node.evaluation_result.resolved is not None:
+                # Set resolution status based on evaluation
+                instance.set_resolution(node.evaluation_result.resolved)
+                logger.info(f"Instance {instance.instance_id} resolution: {instance.resolution_status}")
 
-
-        if node.evaluation_result and node.evaluation_result.details:
-            p2p_failues = node.evaluation_result.details.get("tests_status", {}).get("PASS_TO_PASS", {}).get("failure", [])
-            logger.info(f"Instance {instance.instance_id} p2p failures: {p2p_failues}")
-            if p2p_failues and not "p2p_failures" in instance.issues:
-                instance.issues.append("p2p_failures")
+            if node.evaluation_result.details:
+                p2p_failures = node.evaluation_result.details.get("tests_status", {}).get("PASS_TO_PASS", {}).get("failure", [])
+                if p2p_failures and "p2p_failures" not in instance.issues:
+                    instance.issues.append("p2p_failures")
+
+            logger.info(f"Instance {instance.instance_id} evaluated at: {node.evaluation_result.end_time}")
+            instance.evaluated_at = node.evaluation_result.end_time
 
         if instance.resolution_status != ResolutionStatus.RESOLVED:
             logger.debug(f"Instance {instance.instance_id} and node {node.node_id} has no evaluation result")
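
The started_at/completed_at derivation above leans on node creation order: the root node exists from trajectory setup, so the second node's timestamp is the closest available proxy for when work actually began, while the last node in the tree marks completion. A self-contained sketch of the same idea, using a stand-in Node type rather than the project's class:

    from dataclasses import dataclass
    from datetime import datetime

    @dataclass
    class Node:
        # Stand-in with only the field this logic needs.
        timestamp: datetime

    def infer_span(nodes: list[Node]) -> tuple[datetime, datetime] | None:
        """Return (started_at, completed_at) from a creation-ordered node list."""
        if len(nodes) < 2:
            return None  # only the root exists, so no work has started yet
        return nodes[1].timestamp, nodes[-1].timestamp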
@@ -729,14 +810,18 @@ async def start_instance(self, evaluation_name: str, instance_id: str) -> Evalua
 
         flow = await self._flow_manager.get_flow(evaluation.evaluation_name, instance.instance_id)
         should_save = False
+        evaluated = False
         for leaf_node in flow.root.get_leaf_nodes():
             if leaf_node.error:
                 logger.info(f"Resetting node {leaf_node.node_id} with error")
                 leaf_node.reset()
                 should_save = True
+
+            if leaf_node.evaluation_result:
+                evaluated = True
 
         # Skip if already completed and evaluated
-        if instance.is_finished() and instance.is_evaluated():
+        if instance.is_finished() and evaluated:
             finish_reason = flow.is_finished()
             if finish_reason:
                 logger.info(f"Instance {instance_id} is already completed and evaluated, skipping: {finish_reason}")
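
Note on the check above: the old instance.is_evaluated() call trusted a flag on the instance record, which can drift from what is actually stored in the flow tree; deriving `evaluated` from the leaf nodes makes the flow the source of truth. The loop's effect on the flag is equivalent to this one-liner, assuming only the get_leaf_nodes() and evaluation_result attributes shown in the diff:

    evaluated = any(leaf.evaluation_result for leaf in flow.root.get_leaf_nodes())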
@@ -881,10 +966,11 @@ async def get_evaluation_stats(self, evaluation_name: str) -> EvaluationStats:
         if not evaluation:
             raise ValueError(f"Evaluation {evaluation_name} not found")
 
+        instances = [i for i in evaluation.instances if i.is_finished()]
         # Calculate basic metrics
-        total_instances = len(evaluation.instances)
-        resolved_instances = sum(1 for instance in evaluation.instances if instance.resolved is True)
-        failed_instances = sum(1 for instance in evaluation.instances if instance.resolved is False)
+        total_instances = len(instances)
+        resolved_instances = sum(1 for instance in instances if instance.resolved is True)
+        failed_instances = sum(1 for instance in instances if instance.resolved is False)
         success_rate = (resolved_instances / total_instances) * 100 if total_instances > 0 else 0
 
         # Calculate cost and token metrics
@@ -896,7 +982,7 @@ async def get_evaluation_stats(self, evaluation_name: str) -> EvaluationStats:
         iteration_values = []
         cost_values = []
 
-        for instance in evaluation.instances:
+        for instance in instances:
             if instance.usage:
                 cost = instance.usage.completion_cost or 0
                 total_cost += cost
@@ -958,7 +1044,7 @@ async def get_evaluation_stats(self, evaluation_name: str) -> EvaluationStats:
 
         # Per-repo statistics
         repo_data = {}
-        for instance in evaluation.instances:
+        for instance in instances:
             try:
                 swebench_instance = get_swebench_instance(instance.instance_id)
                 repo = swebench_instance["repo"]
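
Restricting the stats to finished instances changes the denominator of success_rate. With 10 instances, 6 of them finished and 4 of those resolved, the unfiltered version reported 4/10 = 40%, while the filtered version reports 4/6, roughly 66.7%. A worked check of the formula as written, with illustrative values only:

    total_instances = 6  # finished instances only
    resolved_instances = 4
    success_rate = (resolved_instances / total_instances) * 100 if total_instances > 0 else 0
    assert round(success_rate, 1) == 66.7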