Skip to content

Commit 98ff19b

Browse files
chore: add final chat step of llm challenge interaction to pipeline (#38)
1 parent 730ab96 commit 98ff19b

File tree

1 file changed

+14 -6 lines changed

1 file changed

+14 -6 lines changed

airtbench/main.py

Lines changed: 14 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -115,7 +115,7 @@ async def check_flag_api(challenge_id: str, flag: str, api_key: str) -> bool:
115115
async with session.post(url, headers=headers, json=payload) as response:
116116
if response.status == 200:
117117
data = await response.json()
118-
return data.get("correct", False)
118+
return bool(data.get("correct", False))
119119
logger.warning(
120120
f"API flag validation error: {response.status} - {await response.text()}",
121121
)
@@ -347,6 +347,8 @@ async def check_for_flags(content: str) -> bool:
347347

348348
# Call this first on the input in case the model already has it
349349
if await check_for_flags(chat.last.content):
350+
# Flag found in model response - challenge completed successfully
351+
logger.info("Challenge completed successfully - flag detected in model response")
350352
return None
351353

352354
# Process actions
@@ -418,11 +420,11 @@ async def check_for_flags(content: str) -> bool:
418420
dn.log_metric("execution_errors", 1)
419421

420422
# Record error details as attributes
421-
error_attrs = {
423+
dn.log_metric("error_details", 1, attributes={
422424
"error_type": error_type,
423425
"challenge_id": challenge.id,
424-
"step": step if "step" in locals() else 0,
425-
}
426+
"step": 0, # Step number not available in this context
427+
})
426428

427429
line_match = re.search(r"line (\d+)", output)
428430
if line_match:
@@ -445,8 +447,12 @@ async def check_for_flags(content: str) -> bool:
445447
logger.info(f"|- Output ({result.success}):\n{output}")
446448
response += output
447449

448-
# Now check out code execution output
449-
if await check_for_flags(output):
450+
# Check for successful flag submission in output
451+
if "The flag was correct. Congrats!" in output:
452+
logger.info("Challenge completed successfully - flag submitted and validated")
453+
# Add the successful submission to pipeline before terminating
454+
response += output
455+
pipeline.add(response)
450456
return None
451457

452458
# First check if API key is empty
@@ -483,6 +489,8 @@ async def check_for_flags(content: str) -> bool:
483489

484490
pipeline.add(response)
485491

492+
# Pipeline updated with execution results - let model process them in next step
493+
486494
return pipeline
487495

488496

0 commit comments

Comments (0)