Skip to content

Commit e944403

Browse files
fix: resolve critical Terminal-Bench integration issues
- Fix missing os/Dict imports causing NameError in installed agent - Replace sync agent.start() with async agent.astart() to prevent event loop blocking - Fix broken approval API from set_approval_backend to get_approval_registry().set_backend() - Use correct agent metrics (cost_summary, _total_tokens_*) instead of non-existent _usage/_cost - Add agent-scoped approval backends with proper cleanup to prevent global state pollution - Implement populate_context_post_run JSON parsing for Harbor metrics integration - Fix markdown lint issue by adding language tag to fenced code block Addresses P0/P1 issues identified by CodeRabbit, Greptile, Copilot reviewers. Co-authored-by: Mervin Praison <MervinPraison@users.noreply.github.com>
1 parent 1248e77 commit e944403

File tree

4 files changed

+85
-46
lines changed

4 files changed

+85
-46
lines changed

examples/terminal_bench/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ harbor run -d terminal-bench/terminal-bench-2 \
6464

6565
## Architecture
6666

67-
```
67+
```text
6868
┌─────────────────────────────────────────┐
6969
│ Harbor Framework │
7070
│ (Terminal-Bench 2.0 Evaluation) │

examples/terminal_bench/multi_agent_example.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,8 @@ async def run(
6868

6969
# Set auto-approval for container safety
7070
registry = get_approval_registry()
71-
registry.set_backend(AutoApproveBackend())
71+
original_backend = registry.get_backend()
72+
registry.set_backend(AutoApproveBackend(), agent_name="multi-agent-planner")
7273

7374
try:
7475
# Create bash tool that wraps Harbor environment
@@ -136,19 +137,19 @@ async def bash_tool(command: str) -> str:
136137

137138
# Phase 1: Planning
138139
print("📋 Phase 1: Task Planning")
139-
plan = planner.start(f"Create a detailed plan for: {instruction}")
140+
plan = await planner.astart(f"Create a detailed plan for: {instruction}")
140141
print(f"Plan created: {len(plan.split('.')) if plan else 0} steps")
141142

142143
# Phase 2: Execution
143144
print("⚡ Phase 2: Task Execution")
144145
execution_prompt = f"Execute this plan step by step:\n\nOriginal task: {instruction}\n\nPlan:\n{plan}"
145-
execution_result = executor.start(execution_prompt)
146+
execution_result = await executor.astart(execution_prompt)
146147
print("Execution completed")
147148

148149
# Phase 3: Verification
149150
print("✅ Phase 3: Solution Verification")
150151
verification_prompt = f"Verify this solution works correctly:\n\nOriginal task: {instruction}\n\nSolution: {execution_result}\n\nRun tests to confirm it works."
151-
verification_result = verifier.start(verification_prompt)
152+
verification_result = await verifier.astart(verification_prompt)
152153
print("Verification completed")
153154

154155
# Combine results
@@ -168,8 +169,11 @@ async def bash_tool(command: str) -> str:
168169
context.metadata = {"error": str(e)}
169170
raise
170171
finally:
171-
# Reset approval backend (optional)
172-
pass
172+
# Restore original approval backend to avoid global state pollution
173+
if original_backend:
174+
registry.set_backend(original_backend)
175+
else:
176+
registry.remove_backend(agent_name="multi-agent-planner")
173177

174178
def _populate_context(self, agents: list, context: AgentContext, result: Dict[str, Any]) -> None:
175179
"""Populate Harbor context with multi-agent metrics."""
@@ -180,11 +184,10 @@ def _populate_context(self, agents: list, context: AgentContext, result: Dict[st
180184
total_cost = 0.0
181185

182186
for agent in agents:
183-
if hasattr(agent, '_usage') and agent._usage:
184-
total_input_tokens += getattr(agent._usage, 'input_tokens', 0) or 0
185-
total_output_tokens += getattr(agent._usage, 'output_tokens', 0) or 0
186-
if hasattr(agent, '_cost') and agent._cost:
187-
total_cost += agent._cost
187+
# Use agent's actual metrics properties
188+
total_input_tokens += getattr(agent, '_total_tokens_in', 0)
189+
total_output_tokens += getattr(agent, '_total_tokens_out', 0)
190+
total_cost += agent.total_cost or 0.0
188191

189192
context.n_input_tokens = total_input_tokens if total_input_tokens > 0 else None
190193
context.n_output_tokens = total_output_tokens if total_output_tokens > 0 else None
@@ -232,7 +235,8 @@ async def run(
232235
"""Run structured AgentTeam workflow."""
233236

234237
registry = get_approval_registry()
235-
registry.set_backend(AutoApproveBackend())
238+
original_backend = registry.get_backend()
239+
registry.set_backend(AutoApproveBackend(), agent_name="agent-team")
236240

237241
try:
238242
# Create bash tool
@@ -283,7 +287,7 @@ async def bash_tool(command: str) -> str:
283287
)
284288

285289
print(f"🚀 AgentTeam starting: {instruction[:100]}...")
286-
result = team.start(instruction)
290+
result = await team.astart(instruction)
287291
print("✅ AgentTeam completed")
288292

289293
# Populate context
@@ -296,8 +300,11 @@ async def bash_tool(command: str) -> str:
296300
}
297301

298302
finally:
299-
# Reset approval backend (optional)
300-
pass
303+
# Restore original approval backend to avoid global state pollution
304+
if original_backend:
305+
registry.set_backend(original_backend)
306+
else:
307+
registry.remove_backend(agent_name="agent-team")
301308

302309

303310
if __name__ == "__main__":

examples/terminal_bench/praisonai_external_agent.py

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,11 @@ async def run(
7171
This method bridges Harbor's BaseEnvironment.exec() to PraisonAI's tool system.
7272
"""
7373

74-
# Set auto-approval for container-isolated execution
74+
# Set auto-approval for container-isolated execution
7575
# Harbor's container provides isolation, so we can safely auto-approve shell commands
7676
registry = get_approval_registry()
77-
registry.set_backend(AutoApproveBackend())
77+
original_backend = registry.get_backend()
78+
registry.set_backend(AutoApproveBackend(), agent_name="terminal-agent")
7879

7980
try:
8081
# Create bash tool that wraps Harbor's environment.exec()
@@ -118,7 +119,7 @@ async def bash_tool(command: str) -> str:
118119

119120
# Execute the agent
120121
print(f"🚀 PraisonAI Agent starting task: {instruction[:100]}...")
121-
result = agent.start(instruction)
122+
result = await agent.astart(instruction)
122123
print(f"✅ PraisonAI Agent completed task")
123124

124125
# Populate Harbor context with metadata
@@ -129,8 +130,11 @@ async def bash_tool(command: str) -> str:
129130
context.metadata = {"error": str(e)}
130131
raise
131132
finally:
132-
# Reset approval backend (optional - could leave auto-approve for future runs)
133-
pass
133+
# Restore original approval backend to avoid global state pollution
134+
if original_backend:
135+
registry.set_backend(original_backend)
136+
else:
137+
registry.remove_backend(agent_name="terminal-agent")
134138

135139
def _populate_context(self, agent: Agent, context: AgentContext, result: Any) -> None:
136140
"""
@@ -139,16 +143,17 @@ def _populate_context(self, agent: Agent, context: AgentContext, result: Any) ->
139143
Harbor tracks: n_input_tokens, n_output_tokens, cost_usd, metadata
140144
"""
141145
try:
142-
# Extract token usage from agent if available
143-
usage = getattr(agent, '_usage', None)
144-
if usage:
145-
context.n_input_tokens = getattr(usage, 'input_tokens', None)
146-
context.n_output_tokens = getattr(usage, 'output_tokens', None)
147-
148-
# Extract cost if available
149-
cost = getattr(agent, '_cost', None)
150-
if cost:
151-
context.cost_usd = cost
146+
# Extract token usage and cost from agent
147+
summary = agent.cost_summary()
148+
if summary:
149+
context.n_input_tokens = summary.get('tokens_in')
150+
context.n_output_tokens = summary.get('tokens_out')
151+
context.cost_usd = summary.get('cost')
152+
else:
153+
# Fallback to direct properties
154+
context.n_input_tokens = getattr(agent, '_total_tokens_in', 0)
155+
context.n_output_tokens = getattr(agent, '_total_tokens_out', 0)
156+
context.cost_usd = agent.total_cost
152157

153158
# Store result summary and agent info
154159
context.metadata = {

examples/terminal_bench/praisonai_installed_agent.py

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,12 @@
1818
to the Harbor project via PR.
1919
"""
2020

21+
import os
2122
import shlex
2223
import json
2324
import asyncio
2425
from pathlib import Path
25-
from typing import Optional
26+
from typing import Dict, Optional
2627

2728
try:
2829
# These imports would work if this file was in Harbor's codebase
@@ -149,10 +150,10 @@ def main():
149150
try:
150151
from praisonaiagents import Agent
151152
from praisonaiagents.tools import execute_command
152-
from praisonaiagents.approval import set_approval_backend, AutoApproveBackend
153+
from praisonaiagents.approval import get_approval_registry, AutoApproveBackend
153154
154155
# Set auto-approval for container-isolated execution
155-
set_approval_backend(AutoApproveBackend())
156+
get_approval_registry().set_backend(AutoApproveBackend())
156157
157158
# Create terminal agent with shell execution capabilities
158159
agent = Agent(
@@ -278,19 +279,45 @@ def populate_context_post_run(self, context: AgentContext) -> None:
278279
JSON output of the headless runner script.
279280
"""
280281
try:
281-
# Get the last execution result
282-
# This would need to be implemented based on Harbor's execution model
283-
# For now, this is a placeholder that shows the structure
284-
285-
# In a real implementation, you'd parse the stdout from the runner script
286-
# and extract the JSON metrics
287-
288-
context.metadata = {
289-
"framework": "praisonai",
290-
"agent_type": "installed",
291-
"version": self.version() if hasattr(self, 'version') else None,
292-
}
282+
# Parse the last stdout output for JSON metrics
283+
# In Harbor's model, the last execution output should contain our JSON
284+
last_output = getattr(context, '_last_stdout', None)
293285

286+
if last_output:
287+
try:
288+
metrics = json.loads(last_output.strip())
289+
290+
# Extract metrics with safe defaults
291+
context.n_input_tokens = metrics.get('input_tokens')
292+
context.n_output_tokens = metrics.get('output_tokens')
293+
context.cost_usd = metrics.get('cost_usd')
294+
295+
# Store additional metadata
296+
context.metadata = {
297+
"framework": "praisonai",
298+
"agent_type": "installed",
299+
"agent_name": metrics.get('agent_name', 'terminal-agent'),
300+
"model": metrics.get('model'),
301+
"tools_used": metrics.get('tools_used', []),
302+
"version": self.get_version() if hasattr(self, 'get_version') else None,
303+
}
304+
305+
except (json.JSONDecodeError, ValueError) as e:
306+
# If JSON parsing fails, store basic metadata
307+
context.metadata = {
308+
"framework": "praisonai",
309+
"agent_type": "installed",
310+
"parse_error": str(e),
311+
"raw_output": str(last_output)[:200] if last_output else None,
312+
}
313+
else:
314+
# No output to parse
315+
context.metadata = {
316+
"framework": "praisonai",
317+
"agent_type": "installed",
318+
"status": "no_output",
319+
}
320+
294321
except Exception as e:
295322
context.metadata = {"context_population_error": str(e)}
296323

0 commit comments

Comments
 (0)