Commit ac13493

Update API
1 parent ad7e75a commit ac13493

File tree

10 files changed: +252 -57 lines changed


.gitignore

Lines changed: 2 additions & 1 deletion

@@ -181,4 +181,5 @@ instances
 
 playground
 CLAUDE.md
-hej2
+hej2
+.DS_Store

.moatless/flows/swebench_claude.json

Lines changed: 3 additions & 1 deletion

@@ -1,6 +1,8 @@
 {
   "id": "swebench_claude",
   "description": "SWE-bench flow using structured tool calling with Claude Sonnet 4 and built-in reasoning capabilities. (Copy)",
+  "project_id": null,
+  "trajectory_id": null,
   "agent": {
     "agent_id": null,
     "model_id": "claude-sonnet-4-20250514-thinking",

@@ -127,7 +129,7 @@
   },
   "metadata": {},
   "max_iterations": 200,
-  "max_cost": 1.5,
+  "max_cost": 2.0,
   "selector": {
     "selector_class": "moatless.selector.simple.SimpleSelector"
   },
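
For orientation, a small sketch that reads this flow config and checks the fields the commit touches. The path assumes a repository checkout, and the top-level placement of max_cost is inferred from the hunk above rather than shown by it:

import json
from pathlib import Path

# Assumed checkout-relative path to the flow config shown above
config = json.loads(Path(".moatless/flows/swebench_claude.json").read_text())

assert config["id"] == "swebench_claude"
assert config["project_id"] is None      # field added by this commit
assert config["trajectory_id"] is None   # field added by this commit
assert config["max_cost"] == 2.0         # raised from 1.5 by this commit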

moatless/api/swebench/api.py

Lines changed: 38 additions & 3 deletions

@@ -16,6 +16,8 @@
 from moatless.flow.search_tree import SearchTree
 
 from .schema import (
+    AddDatasetRequestDTO,
+    AddInstancesRequestDTO,
     DatasetDTO,
     DatasetsResponseDTO,
     EvaluationRequestDTO,

@@ -34,13 +36,11 @@
 @router.post("/evaluations", response_model=Evaluation)
 async def create_evaluation(
     request: EvaluationRequestDTO,
-    model_manager: ModelConfigManager = Depends(get_model_manager),
-    flow_manager: FlowManager = Depends(get_flow_manager),
     evaluation_manager: EvaluationManager = Depends(get_evaluation_manager),
 ):
     """Create a new evaluation run for a dataset."""
     logger.info(
-        f"Creating evaluation for dataset {request.dataset} with flow {request.flow_id} and model {request.model_id} and litellm_model_name {request.litellm_model_name}"
+        f"Creating evaluation {request.name} for dataset {request.dataset} with flow {request.flow_id} and model {request.model_id} and litellm_model_name {request.litellm_model_name}"
     )
 
     if request.flow:

@@ -49,6 +49,7 @@ async def create_evaluation(
         flow = None
 
     evaluation = await evaluation_manager.create_evaluation(
+        evaluation_name=request.name,
         dataset_name=request.dataset,
         instance_ids=request.instance_ids,
         flow_id=request.flow_id,

@@ -220,6 +221,40 @@ async def cancel_evaluation_jobs(
         raise HTTPException(status_code=500, detail=str(e))
 
 
+@router.post("/evaluations/{evaluation_name}/instances", response_model=Evaluation)
+async def add_instances_to_evaluation(
+    evaluation_name: str,
+    request: AddInstancesRequestDTO,
+    evaluation_manager: EvaluationManager = Depends(get_evaluation_manager),
+):
+    """Add new instances to an existing evaluation."""
+    try:
+        evaluation = await evaluation_manager.add_instances_to_evaluation(evaluation_name, request.instance_ids)
+        return evaluation
+    except ValueError as e:
+        raise HTTPException(status_code=404, detail=str(e))
+    except Exception as e:
+        logger.exception(f"Failed to add instances to evaluation: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/evaluations/{evaluation_name}/dataset", response_model=Evaluation)
+async def add_dataset_to_evaluation(
+    evaluation_name: str,
+    request: AddDatasetRequestDTO,
+    evaluation_manager: EvaluationManager = Depends(get_evaluation_manager),
+):
+    """Add all instances from a dataset to an existing evaluation."""
+    try:
+        evaluation = await evaluation_manager.add_dataset_to_evaluation(evaluation_name, request.dataset_name)
+        return evaluation
+    except ValueError as e:
+        raise HTTPException(status_code=404, detail=str(e))
+    except Exception as e:
+        logger.exception(f"Failed to add dataset to evaluation: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
 @router.post("/evaluations/{evaluation_name}/instances/{instance_id}/start")
 async def start_instance(
     evaluation_name: str, instance_id: str, evaluation_manager: EvaluationManager = Depends(get_evaluation_manager)
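
For reference, a minimal client-side sketch of the two endpoints added above. The host and any router prefix are assumptions (neither is visible in this diff), and the evaluation and dataset names are placeholders; the request bodies follow the AddInstancesRequestDTO and AddDatasetRequestDTO schemas shown in the next file.

import requests

BASE_URL = "http://localhost:8000"  # assumed host; the router prefix is not shown in this diff

# Add specific instances to an existing evaluation (body: AddInstancesRequestDTO)
resp = requests.post(
    f"{BASE_URL}/evaluations/my_eval/instances",
    json={"instance_ids": ["django__django-11099", "sympy__sympy-13480"]},
)
resp.raise_for_status()

# Add every instance from a dataset to the same evaluation (body: AddDatasetRequestDTO)
resp = requests.post(
    f"{BASE_URL}/evaluations/my_eval/dataset",
    json={"dataset_name": "my_dataset"},  # placeholder dataset name
)
resp.raise_for_status()
print(resp.json())  # serialized Evaluation, per response_model=Evaluation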

moatless/api/swebench/schema.py

Lines changed: 12 additions & 0 deletions

@@ -81,6 +81,18 @@ class EvaluationRequestDTO(BaseModel):
     instance_ids: Optional[list[str]] = None
 
 
+class AddInstancesRequestDTO(BaseModel):
+    """Request for adding instances to an existing evaluation"""
+
+    instance_ids: list[str] = Field(..., description="List of instance IDs to add to the evaluation")
+
+
+class AddDatasetRequestDTO(BaseModel):
+    """Request for adding a dataset to an existing evaluation"""
+
+    dataset_name: str = Field(..., description="Name of the dataset to add to the evaluation")
+
+
 class DatasetDTO(BaseModel):
     """DTO for dataset information"""
 
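
Both new DTOs declare their single field as required via Field(...). A quick validation sketch, assuming Pydantic v2 (under v1 the equivalents are .dict() and len(e.errors())); the class is redeclared here only to keep the example self-contained:

from pydantic import BaseModel, Field, ValidationError

class AddInstancesRequestDTO(BaseModel):
    """Request for adding instances to an existing evaluation"""

    instance_ids: list[str] = Field(..., description="List of instance IDs to add to the evaluation")

ok = AddInstancesRequestDTO(instance_ids=["django__django-11099"])
print(ok.model_dump())  # {'instance_ids': ['django__django-11099']}

try:
    AddInstancesRequestDTO()  # the field is required, so an empty body raises
except ValidationError as exc:
    print(exc.error_count(), "validation error")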

moatless/api/trajectories/api.py

Lines changed: 5 additions & 25 deletions

@@ -74,14 +74,7 @@ async def get_trajectory_logs(
     Returns:
         The log file contents
     """
-    try:
-        return await flow_manager.get_trajectory_logs(project_id, trajectory_id, file_name)
-    except ValueError as e:
-        logger.exception(f"Error getting trajectory logs: {str(e)}")
-        raise HTTPException(status_code=404, detail=str(e))
-    except Exception as e:
-        logger.exception(f"Error getting trajectory logs: {str(e)}")
-        raise HTTPException(status_code=500, detail=str(e))
+    return await flow_manager.get_trajectory_logs(project_id, trajectory_id, file_name)
 
 
 @router.get("/{project_id}/{trajectory_id}/events")

@@ -146,15 +139,9 @@ async def retry_trajectory(
     flow_manager: FlowManager = Depends(get_flow_manager),
 ):
     """Reset and restart a trajectory by removing all children from the root node."""
-    try:
-        await flow_manager.retry_trajectory(project_id, trajectory_id)
-        return {"status": "success", "message": f"Retried trajectory {trajectory_id}"}
-    except ValueError as e:
-        logger.exception(f"Error retrying trajectory: {str(e)}")
-        raise HTTPException(status_code=400, detail=str(e))
-    except Exception as e:
-        logger.exception(f"Error retrying trajectory: {str(e)}")
-        raise HTTPException(status_code=500, detail=str(e))
+    await flow_manager.retry_trajectory(project_id, trajectory_id)
+    return {"status": "success", "message": f"Retried trajectory {trajectory_id}"}
+
 
 
 @router.post("/{project_id}/{trajectory_id}/resume")

@@ -226,14 +213,7 @@ async def get_node_evaluation_files(
     Returns:
         A dictionary mapping file names to file contents
     """
-    try:
-        return await flow_manager.get_node_evaluation_files(project_id, trajectory_id, node_id)
-    except ValueError as e:
-        logger.exception(f"Error getting node evaluation files: {str(e)}")
-        raise HTTPException(status_code=404, detail=str(e))
-    except Exception as e:
-        logger.exception(f"Error getting node evaluation files: {str(e)}")
-        raise HTTPException(status_code=500, detail=str(e))
+    return await flow_manager.get_node_evaluation_files(project_id, trajectory_id, node_id)
 
 
 @router.get("/{project_id}/{trajectory_id}/chat-messages")
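
These three handlers now let exceptions propagate instead of mapping them to HTTP errors per route. The commit does not show where that mapping lives now; one common FastAPI pattern that would preserve the old ValueError-to-404 behavior is an app-level exception handler — a sketch under that assumption, not code from this repository:

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse

app = FastAPI()

@app.exception_handler(ValueError)
async def value_error_handler(request: Request, exc: ValueError) -> JSONResponse:
    # Map "not found"-style domain errors to 404 in one place instead of in every route
    return JSONResponse(status_code=404, content={"detail": str(exc)})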

moatless/evaluation/manager.py

Lines changed: 105 additions & 19 deletions

@@ -80,8 +80,7 @@ async def create_evaluation(
 
         if not evaluation_name:
             evaluation_name = f"eval_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}_{flow_id}_{dataset_name}"
-
-        logger.info(f"Creating evaluation: {evaluation_name}")
+            logger.info(f"Use generated evaluation name: {evaluation_name}")
 
         if not instance_ids:
             instance_ids = self.get_dataset_instance_ids(dataset_name)
@@ -179,6 +178,80 @@ async def clone_evaluation(self, evaluation_name: str) -> Evaluation:
             instance_ids=instance_ids,
         )
 
+    async def add_instances_to_evaluation(self, evaluation_name: str, instance_ids: list[str]) -> Evaluation:
+        """Add new instances to an existing evaluation."""
+        evaluation = await self._load_evaluation(evaluation_name)
+        if not evaluation:
+            raise ValueError(f"Evaluation {evaluation_name} not found")
+
+        # Find existing instance IDs
+        existing_instance_ids = {instance.instance_id for instance in evaluation.instances}
+
+        # Filter out instances that already exist
+        new_instance_ids = [instance_id for instance_id in instance_ids if instance_id not in existing_instance_ids]
+
+        if not new_instance_ids:
+            logger.info(f"All provided instances already exist in evaluation {evaluation_name}")
+            return evaluation
+
+        logger.info(f"Adding {len(new_instance_ids)} new instances to evaluation {evaluation_name}")
+
+        # Create new EvaluationInstance objects
+        new_instances = [EvaluationInstance(instance_id=instance_id) for instance_id in new_instance_ids]
+
+        # Add them to the evaluation
+        evaluation.instances.extend(new_instances)
+
+        # Create trajectories for the new instances
+        for instance in new_instances:
+            await self._create_trajectory(evaluation, instance)
+
+        # Save the updated evaluation
+        await self._save_evaluation(evaluation)
+
+        logger.info(f"Successfully added {len(new_instances)} instances to evaluation {evaluation_name}")
+        return evaluation
+
+    async def add_dataset_to_evaluation(self, evaluation_name: str, dataset_name: str) -> Evaluation:
+        """Add all instances from a dataset to an existing evaluation."""
+        evaluation = await self._load_evaluation(evaluation_name)
+        if not evaluation:
+            raise ValueError(f"Evaluation {evaluation_name} not found")
+
+        try:
+            # Get all instance IDs from the dataset
+            dataset_instance_ids = self.get_dataset_instance_ids(dataset_name)
+        except ValueError as e:
+            raise ValueError(f"Dataset {dataset_name} not found: {str(e)}")
+
+        # Find existing instance IDs
+        existing_instance_ids = {instance.instance_id for instance in evaluation.instances}
+
+        # Filter out instances that already exist
+        new_instance_ids = [instance_id for instance_id in dataset_instance_ids if instance_id not in existing_instance_ids]
+
+        if not new_instance_ids:
+            logger.info(f"All instances from dataset {dataset_name} already exist in evaluation {evaluation_name}")
+            return evaluation
+
+        logger.info(f"Adding {len(new_instance_ids)} instances from dataset {dataset_name} to evaluation {evaluation_name}")
+
+        # Create new EvaluationInstance objects
+        new_instances = [EvaluationInstance(instance_id=instance_id) for instance_id in new_instance_ids]
+
+        # Add them to the evaluation
+        evaluation.instances.extend(new_instances)
+
+        # Create trajectories for the new instances
+        for instance in new_instances:
+            await self._create_trajectory(evaluation, instance)
+
+        # Save the updated evaluation
+        await self._save_evaluation(evaluation)
+
+        logger.info(f"Successfully added {len(new_instances)} instances from dataset {dataset_name} to evaluation {evaluation_name}")
+        return evaluation
+
     @tracer.start_as_current_span("EvaluationManager.start_evaluation")
     async def start_evaluation(self, evaluation_name: str) -> Evaluation:
         """Start an evaluation by running all instances."""
@@ -396,7 +469,8 @@ async def _process_trajectory_results(
             project_id=evaluation.evaluation_name,
             trajectory_id=instance.instance_id,
         ):
-            logger.warning(f"Instance {instance.instance_id} not found in storage {self.storage}")
+            logger.warning(f"Instance {instance.instance_id} not found in storage {self.storage}, will create it")
+            await self._create_trajectory(evaluation, instance)
             return instance
 
         try:
@@ -416,24 +490,31 @@
 
         if node.reward:
             instance.reward = node.reward.value
+
+        if len(node.get_all_nodes()) > 1:
+            # TODO: Second created nod is the best indication on start time...
+            instance.started_at = node.get_all_nodes()[1].timestamp
+            instance.completed_at = flow.root.get_all_nodes()[-1].timestamp
 
         logger.debug(f"Instance {instance.instance_id} flow is finished: {flow.is_finished()}")
         if flow.is_finished():
             instance.execution_status = ExecutionStatus.COMPLETED
         else:
             instance.execution_status = ExecutionStatus.CREATED
 
-        if node.evaluation_result and node.evaluation_result.resolved is not None:
-            # Set resolution status based on evaluation
-            instance.set_resolution(node.evaluation_result.resolved)
-            logger.info(f"Instance {instance.instance_id} resolution: {instance.resolution_status}")
+        if node.evaluation_result:
+            if node.evaluation_result.resolved is not None:
+                # Set resolution status based on evaluation
+                instance.set_resolution(node.evaluation_result.resolved)
+                logger.info(f"Instance {instance.instance_id} resolution: {instance.resolution_status}")
 
-
-        if node.evaluation_result and node.evaluation_result.details:
-            p2p_failues = node.evaluation_result.details.get("tests_status", {}).get("PASS_TO_PASS", {}).get("failure", [])
-            logger.info(f"Instance {instance.instance_id} p2p failures: {p2p_failues}")
-            if p2p_failues and not "p2p_failures" in instance.issues:
-                instance.issues.append("p2p_failures")
+            if node.evaluation_result.details:
+                p2p_failues = node.evaluation_result.details.get("tests_status", {}).get("PASS_TO_PASS", {}).get("failure", [])
+                if p2p_failues and not "p2p_failures" in instance.issues:
+                    instance.issues.append("p2p_failures")
+
+            logger.info(f"Instance {instance.instance_id} evaluated at: {node.evaluation_result.end_time}")
+            instance.evaluated_at = node.evaluation_result.end_time
 
         if instance.resolution_status != ResolutionStatus.RESOLVED:
             logger.debug(f"Instance {instance.instance_id} and node {node.node_id} has no evaluation result")
@@ -729,14 +810,18 @@ async def start_instance(self, evaluation_name: str, instance_id: str) -> Evalua
 
         flow = await self._flow_manager.get_flow(evaluation.evaluation_name, instance.instance_id)
         should_save = False
+        evaluated = False
         for leaf_node in flow.root.get_leaf_nodes():
             if leaf_node.error:
                 logger.info(f"Resetting node {leaf_node.node_id} with error")
                 leaf_node.reset()
                 should_save = True
+
+            if leaf_node.evaluation_result:
+                evaluated = True
 
         # Skip if already completed and evaluated
-        if instance.is_finished() and instance.is_evaluated():
+        if instance.is_finished() and evaluated:
             finish_reason = flow.is_finished()
             if finish_reason:
                 logger.info(f"Instance {instance_id} is already completed and evaluated, skipping: {finish_reason}")
@@ -881,10 +966,11 @@ async def get_evaluation_stats(self, evaluation_name: str) -> EvaluationStats:
         if not evaluation:
             raise ValueError(f"Evaluation {evaluation_name} not found")
 
+        instances = [i for i in evaluation.instances if i.is_finished()]
         # Calculate basic metrics
-        total_instances = len(evaluation.instances)
-        resolved_instances = sum(1 for instance in evaluation.instances if instance.resolved is True)
-        failed_instances = sum(1 for instance in evaluation.instances if instance.resolved is False)
+        total_instances = len(instances)
+        resolved_instances = sum(1 for instance in instances if instance.resolved is True)
+        failed_instances = sum(1 for instance in instances if instance.resolved is False)
         success_rate = (resolved_instances / total_instances) * 100 if total_instances > 0 else 0
 
         # Calculate cost and token metrics
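
The change above narrows every aggregate to finished instances, which shrinks the denominator of success_rate while in-flight instances are still running. A small worked example with made-up numbers:

# Made-up numbers: 100 instances total, 60 finished, 45 of those resolved.
total_created = 100
finished = 60
resolved = 45

old_success_rate = (resolved / total_created) * 100                    # 45.0, before this commit
new_success_rate = (resolved / finished) * 100 if finished > 0 else 0  # 75.0, after this commit
print(old_success_rate, new_success_rate)
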
@@ -896,7 +982,7 @@ async def get_evaluation_stats(self, evaluation_name: str) -> EvaluationStats:
         iteration_values = []
         cost_values = []
 
-        for instance in evaluation.instances:
+        for instance in instances:
             if instance.usage:
                 cost = instance.usage.completion_cost or 0
                 total_cost += cost
@@ -958,7 +1044,7 @@ async def get_evaluation_stats(self, evaluation_name: str) -> EvaluationStats:
 
         # Per-repo statistics
         repo_data = {}
-        for instance in evaluation.instances:
+        for instance in instances:
             try:
                 swebench_instance = get_swebench_instance(instance.instance_id)
                 repo = swebench_instance["repo"]
