This repository was archived by the owner on Dec 23, 2025. It is now read-only.

Commit 2582cf9

tau2 enhancements (#2)
1 parent 1fff7ab commit 2582cf9

File tree: 9 files changed (+1377, -185 lines)


.gitignore

Lines changed: 4 additions & 1 deletion
```diff
@@ -3,4 +3,7 @@
 .python-version
 .venv/
 __pycache__/
-*.pyc
+*.pyc
+
+# tau2-bench data (downloaded via setup.sh)
+scenarios/tau2/tau2-bench/
```

pyproject.toml

Lines changed: 6 additions & 0 deletions
```diff
@@ -14,9 +14,12 @@ dependencies = [
     "google-genai>=1.36.0",
     "litellm>=1.0.0",
     "loguru>=0.7.0",
+    "nest-asyncio>=1.6.0",
     "pydantic>=2.11.9",
     "python-dotenv>=1.1.1",
     "uvicorn>=0.35.0",
+    # tau2 from GitHub
+    "tau2 @ git+https://github.com/sierra-research/tau2-bench.git",
 ]
 
 [project.scripts]
@@ -28,5 +31,8 @@ dev-dependencies = [
     "mypy>=1.18.1",
 ]
 
+[tool.hatch.metadata]
+allow-direct-references = true
+
 [tool.hatch.build.targets.wheel]
 packages = ["src/agentbeats"]
```
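The new `[tool.hatch.metadata]` table is what permits the Git URL dependency above: hatchling rejects direct references in project metadata unless `allow-direct-references = true` is set. As a quick, illustrative sanity check (not part of this commit), something like the following confirms the Git-sourced `tau2` package resolved after `uv sync`:

```python
# Illustrative sanity check (not part of this commit): confirm the Git-sourced
# tau2 dependency is importable after `uv sync`.
import importlib.metadata

try:
    print("tau2 installed, version:", importlib.metadata.version("tau2"))
except importlib.metadata.PackageNotFoundError:
    print("tau2 not found - re-run `uv sync`")
```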

scenarios/tau2/Dockerfile.tau2-evaluator

Lines changed: 1 addition & 6 deletions
```diff
@@ -12,16 +12,11 @@ RUN \
     --mount=type=cache,target=/home/agentbeats/.cache/uv,uid=1000 \
     uv sync --locked
 
-# Install tau2 from GitHub (not on PyPI)
-RUN \
-    --mount=type=cache,target=/home/agentbeats/.cache/uv,uid=1000 \
-    uv pip install "tau2 @ git+https://github.com/sierra-research/tau2-bench.git"
-
-# Download tau2 data
 USER root
 RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 USER agentbeats
 
+# Download tau2 data
 RUN git clone --depth 1 --filter=blob:none --sparse https://github.com/sierra-research/tau2-bench.git /home/agentbeats/tau2-bench && \
     cd /home/agentbeats/tau2-bench && \
     git sparse-checkout set data
```
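With `tau2` now a locked project dependency, the separate `uv pip install` layer is redundant, and the image only needs `git` for the shallow, blob-filtered sparse clone that fetches the `data/` directory. A rough sketch of a check that the data landed where the image expects it (the path comes from the Dockerfile above; everything else is illustrative):

```python
# Rough sketch (illustrative only): verify the sparse checkout produced the
# data directory the evaluator expects inside the container.
from pathlib import Path

data_dir = Path("/home/agentbeats/tau2-bench/data")
if not data_dir.is_dir():
    raise SystemExit(f"missing tau2 data at {data_dir}")
for entry in sorted(data_dir.iterdir()):
    print(entry.name)
```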

scenarios/tau2/README.md

Lines changed: 41 additions & 0 deletions
````diff
@@ -0,0 +1,41 @@
+# Tau2 Benchmark Scenario
+
+This scenario evaluates agents on the [tau2-bench](https://github.com/sierra-research/tau2-bench) benchmark, which tests customer service agents on realistic tasks.
+
+## Setup
+
+1. **Download the tau2-bench data** (required once):
+
+   ```bash
+   ./scenarios/tau2/setup.sh
+   ```
+
+2. **Set your API key** in `.env`:
+
+   ```
+   OPENAI_API_KEY=your-key-here
+   ```
+
+## Running the Benchmark
+
+```bash
+TAU2_DATA_DIR=./scenarios/tau2/tau2-bench/data uv run agentbeats-run scenarios/tau2/scenario.toml
+```
+
+## Configuration
+
+Edit `scenario.toml` to configure the benchmark:
+
+```toml
+[config]
+domain = "airline"          # airline, retail, telecom, or mock
+num_tasks = 5               # number of tasks to run
+user_llm = "openai/gpt-4.1" # LLM for user simulator (optional, defaults to gpt-4.1)
+```
+
+The agent LLM defaults to `openai/gpt-4.1` and can be configured via the `--agent-llm` CLI argument in `tau2_agent.py`.
+
+## Architecture
+
+- **tau2_evaluator.py** (Green Agent): Runs the tau2 Orchestrator which coordinates the user simulator, environment, and agent
+- **tau2_agent.py** (Purple Agent): The agent being tested - receives task descriptions and responds with tool calls or user responses
````
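The `[config]` table documented in the README is plain TOML, so it can be inspected with the standard library. A minimal sketch of that pattern, assuming Python 3.11+ for `tomllib`; the defaults mirror the README, and the real evaluator may load the file differently:

```python
# Sketch only: read the [config] table from scenario.toml with the stdlib
# tomllib (Python 3.11+). Defaults mirror the README, not the evaluator's code.
import tomllib

with open("scenarios/tau2/scenario.toml", "rb") as f:
    scenario = tomllib.load(f)

config = scenario.get("config", {})
domain = config.get("domain", "airline")             # airline, retail, telecom, or mock
num_tasks = config.get("num_tasks", 5)               # number of tasks to run
user_llm = config.get("user_llm", "openai/gpt-4.1")  # user-simulator model
print(domain, num_tasks, user_llm)
```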

scenarios/tau2/scenario.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -9,4 +9,4 @@ cmd = "python scenarios/tau2/tau2_agent.py --host 127.0.0.1 --port 9019"
 
 [config]
 domain = "airline"
-num_tasks = 3
+num_tasks = 4
```

scenarios/tau2/setup.sh

Lines changed: 24 additions & 0 deletions
```diff
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Setup script for tau2 scenario
+# This downloads the tau2-bench data required to run the benchmark
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TAU2_BENCH_DIR="$SCRIPT_DIR/tau2-bench"
+
+if [ -d "$TAU2_BENCH_DIR" ]; then
+    echo "tau2-bench already exists at $TAU2_BENCH_DIR"
+    echo "To re-download, remove the directory first: rm -rf $TAU2_BENCH_DIR"
+    exit 0
+fi
+
+echo "Cloning tau2-bench repository..."
+git clone --depth 1 https://github.com/sierra-research/tau2-bench.git "$TAU2_BENCH_DIR"
+
+echo ""
+echo "Setup complete! tau2-bench data is now available at:"
+echo "  $TAU2_BENCH_DIR/data"
+echo ""
+echo "To run the tau2 scenario:"
+echo "  TAU2_DATA_DIR=$TAU2_BENCH_DIR/data uv run agentbeats-run scenarios/tau2/scenario.toml"
```
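The script clones the full repository next to the scenario (the same path the new `.gitignore` entry excludes) and prints the `TAU2_DATA_DIR` needed at run time. A hypothetical pre-flight check along these lines (not part of the commit) can fail fast if the data is missing:

```python
# Hypothetical pre-flight check, not part of this commit: fail fast if the
# tau2-bench data directory is absent before launching the benchmark.
import os
from pathlib import Path

data_dir = Path(os.environ.get("TAU2_DATA_DIR", "scenarios/tau2/tau2-bench/data"))
if not data_dir.is_dir():
    raise SystemExit(f"tau2 data not found at {data_dir}; run ./scenarios/tau2/setup.sh first")
print(f"Using tau2 data from {data_dir}")
```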

scenarios/tau2/tau2_agent.py

Lines changed: 7 additions & 31 deletions
```diff
@@ -4,10 +4,8 @@
 This is the agent being tested. It:
 1. Receives task descriptions with available tools from the green agent
 2. Decides which tool to call or how to respond
-3. Returns responses in the expected JSON format wrapped in <json>...</json> tags
 """
 import argparse
-import os
 import uvicorn
 from dotenv import load_dotenv
 
@@ -45,38 +43,14 @@ def prepare_agent_card(url: str) -> AgentCard:
     )
 
 
-SYSTEM_PROMPT = """You are a helpful customer service agent being evaluated on your ability to solve tasks.
-
-You will receive:
-1. A policy describing your role and guidelines
-2. A list of available tools you can use
-3. User messages that you need to respond to
-
-CRITICAL: You MUST respond in the exact JSON format specified, wrapped in <json>...</json> tags.
-
-For tool calls, respond with:
-<json>
-{"name": "tool_name", "arguments": {"arg1": "value1", ...}}
-</json>
-
-To respond directly to the user, use:
-<json>
-{"name": "respond", "arguments": {"content": "Your message here"}}
-</json>
-
-Rules:
-- Only use one tool at a time
-- You cannot respond to the user AND use a tool in the same message
-- Follow the policy guidelines provided
-- Be helpful and accurate
-- ALWAYS wrap your response in <json>...</json> tags
-"""
+SYSTEM_PROMPT = """You are a helpful customer service agent. Follow the policy and tool instructions provided in each message."""
 
 
 class Tau2AgentExecutor(AgentExecutor):
     """Executor for the tau2 purple agent."""
 
-    def __init__(self):
+    def __init__(self, model: str):
+        self.model = model
         self.ctx_id_to_messages: dict[str, list[dict]] = {}
 
     async def execute(self, context: RequestContext, event_queue: EventQueue) -> None:
@@ -96,8 +70,9 @@ async def execute(self, context: RequestContext, event_queue: EventQueue) -> Non
         try:
             response = completion(
                 messages=messages,
-                model="openai/gpt-4o",
+                model=self.model,
                 temperature=0.0,
+                response_format={ "type": "json_object" },
             )
             assistant_content = response.choices[0].message.content
             logger.info(f"LLM response: {assistant_content[:200]}...")
@@ -122,13 +97,14 @@ def main():
     parser.add_argument("--host", type=str, default="127.0.0.1", help="Host to bind the server")
     parser.add_argument("--port", type=int, default=9019, help="Port to bind the server")
    parser.add_argument("--card-url", type=str, help="External URL for the agent card")
+    parser.add_argument("--agent-llm", type=str, default="openai/gpt-4.1", help="LLM model to use")
     args = parser.parse_args()
 
     logger.info("Starting tau2 agent...")
     card = prepare_agent_card(args.card_url or f"http://{args.host}:{args.port}/")
 
     request_handler = DefaultRequestHandler(
-        agent_executor=Tau2AgentExecutor(),
+        agent_executor=Tau2AgentExecutor(model=args.agent_llm),
         task_store=InMemoryTaskStore(),
     )
 
```
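The substantive changes here: the model is injected via the new `--agent-llm` flag instead of being hard-coded to `openai/gpt-4o`, and the completion call requests JSON-mode output through litellm's `response_format`, which pairs with dropping the long format-coaching system prompt. A stripped-down sketch of that call pattern, outside the A2A executor plumbing (message contents are placeholders, not the agent's actual code):

```python
# Stripped-down sketch of the pattern the diff moves to: a configurable model
# plus JSON-mode output via litellm. Placeholders only, not the agent's code.
from litellm import completion

def ask_agent(model: str, messages: list[dict]) -> str:
    response = completion(
        messages=messages,
        model=model,                              # e.g. "openai/gpt-4.1" from --agent-llm
        temperature=0.0,
        response_format={"type": "json_object"},  # ask the provider for JSON output
    )
    return response.choices[0].message.content

# Example (assumes OPENAI_API_KEY is set in the environment):
# ask_agent("openai/gpt-4.1", [
#     {"role": "system", "content": "You are a helpful customer service agent."},
#     {"role": "user", "content": "Reply with a JSON object containing a greeting."},
# ])
```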
