hud-blank/local_test.py at main · hud-evals/hud-blank · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
"""Local test script for the blank environment.

Run the backend first: uvicorn backend.app:app --port 8005
Then run this script: python local_test.py
"""

import asyncio

import hud
from hud.agents import OpenAIChatAgent
from openai import AsyncOpenAI

from env import env

# Use HUD inference gateway - see all models at https://hud.ai/models
client = AsyncOpenAI(base_url="https://inference.hud.ai")


async def test_tools_standalone():
    """Test environment tools directly."""
    print("=== Test 1: Standalone Tools ===")

    async with env:
        print(f"Tools: {[t.name for t in env.as_tools()]}")
        for _ in range(3):
            print(await env.call_tool("act"))


async def test_scenario_manual():
    """Test scenario with manual OpenAI calls."""
    print("\n=== Test 2: Scenario (Manual Agent Loop) ===")

    task = env("count-to", target=3)

    async with hud.eval(task) as ctx:
        messages = [{"role": "user", "content": ctx.prompt}]

        while True:
            response = await client.chat.completions.create(
                model="gpt-4o",  # https://hud.ai/models
                messages=messages,
                tools=ctx.as_openai_chat_tools(),
            )
            msg = response.choices[0].message

            if not msg.tool_calls:
                break

            messages.append(msg)
            for tc in msg.tool_calls:
                result = await ctx.call_tool(tc)
                messages.append(result)


async def test_scenario_agent():
    """Test scenario with OpenAIChatAgent."""
    print("\n=== Test 3: Scenario (Agent) ===")

    task = env("count-to", target=5)

    async with hud.eval(task) as ctx:
        agent = OpenAIChatAgent.create(model="gpt-4o")  # https://hud.ai/models
        await agent.run(ctx)


async def test_distribution():
    """Test multiple tasks with variants and groups for A/B testing."""
    print("\n=== Test 4: Distribution (Variants + Groups) ===")

    tasks = [env("count-to", target=3), env("count-to", target=5)]
    variants = {"model": ["gpt-4o-mini", "gpt-4o"]}
    group = 2

    async with hud.eval(tasks, variants=variants, group=group) as ctx:
        agent = OpenAIChatAgent.create(model=ctx.variants["model"])
        await agent.run(ctx, max_steps=10)


async def main():
    await test_tools_standalone()
    await test_scenario_manual()
    await test_scenario_agent()
    await test_distribution()


if __name__ == "__main__":
    asyncio.run(main())