Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 53 additions & 36 deletions examples/run_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,7 @@
from hud.agents import ClaudeAgent, OperatorAgent
from hud.agents.misc.response_agent import ResponseAgent
from hud.clients import MCPClient
from hud.datasets import (
Task,
run_dataset,
run_dataset_parallel,
run_dataset_parallel_manual
)
from hud.datasets import Task, run_dataset, run_dataset_parallel, run_dataset_parallel_manual

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -137,7 +132,7 @@ async def run_full_dataset(
max_concurrent_per_worker: int = 25,
) -> list[Any]:
"""Run evaluation across the entire dataset.

Uses either asyncio-based run_dataset or process-based run_dataset_parallel
depending on the parallel flag.
"""
Expand All @@ -157,7 +152,7 @@ async def run_full_dataset(
}

eval_name = f"Evaluation {dataset_name.split('/')[-1]}"

if parallel:
print(f"🚀 Running PARALLEL evaluation (workers: {max_workers or 'auto'})…")
if max_workers is None:
Expand Down Expand Up @@ -214,44 +209,63 @@ def parse_args() -> argparse.Namespace: # type: ignore[valid-type]
%(prog)s hud-evals/LargeDataset --full --parallel # Large dataset (100+ tasks)
""",
)

parser.add_argument("dataset", help="HuggingFace dataset ID")
parser.add_argument("--full", action="store_true", help="Run entire dataset")

# Agent
parser.add_argument("--agent", choices=["claude", "openai"], default="claude")
parser.add_argument("--model", default=None, help="Model override")
parser.add_argument("--allowed-tools", dest="allowed_tools", help="Tool allowlist (comma-separated)")

parser.add_argument(
"--allowed-tools", dest="allowed_tools", help="Tool allowlist (comma-separated)"
)

# Concurrency
parser.add_argument("--max-concurrent", dest="max_concurrent", type=int, default=50,
help="Max concurrent tasks (default: 50)")

parser.add_argument(
"--max-concurrent",
dest="max_concurrent",
type=int,
default=50,
help="Max concurrent tasks (default: 50)",
)

# Task settings
parser.add_argument("--max-steps", dest="max_steps", type=int, default=10,
help="Max steps per task (default: 10)")

parser.add_argument(
"--max-steps",
dest="max_steps",
type=int,
default=10,
help="Max steps per task (default: 10)",
)

# Parallel mode (100+ tasks)
parser.add_argument("--parallel", action="store_true", help="Use parallel execution for large datasets")
parser.add_argument(
"--parallel", action="store_true", help="Use parallel execution for large datasets"
)
parser.add_argument("--max-workers", dest="max_workers", type=int, help="Worker processes")
parser.add_argument("--max-concurrent-per-worker", dest="max_concurrent_per_worker", type=int, default=25,
help="Max concurrent tasks per worker")

parser.add_argument(
"--max-concurrent-per-worker",
dest="max_concurrent_per_worker",
type=int,
default=25,
help="Max concurrent tasks per worker",
)

# Logging
parser.add_argument("--verbose", "-v", action="store_true", help="Show detailed agent step logs")

parser.add_argument(
"--verbose", "-v", action="store_true", help="Show detailed agent step logs"
)

return parser.parse_args()


async def main() -> None:
args = parse_args()

if args.verbose:
# Detailed logs - show everything including agent steps
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(message)s',
datefmt='%H:%M:%S'
level=logging.INFO, format="%(asctime)s - %(name)s - %(message)s", datefmt="%H:%M:%S"
)
# Ensure HUD agent logs are visible
logging.getLogger("hud.agents").setLevel(logging.INFO)
Expand All @@ -265,8 +279,9 @@ async def main() -> None:

if args.full:
import time

start_time = time.time()

results = await run_full_dataset(
args.dataset,
agent_type=args.agent,
Expand All @@ -278,26 +293,28 @@ async def main() -> None:
max_workers=args.max_workers,
max_concurrent_per_worker=args.max_concurrent_per_worker,
)

elapsed = time.time() - start_time

# Print statistics
print("\n" + "=" * 50)
print("📊 Evaluation Complete!")
print("=" * 50)
print(f"Total tasks: {len(results)}")
print(f"Time elapsed: {elapsed:.2f} seconds")
print(f"Throughput: {len(results)/elapsed:.2f} tasks/second")
print(f"Throughput: {len(results) / elapsed:.2f} tasks/second")

if args.parallel:
print(f"Execution mode: PARALLEL (workers: {args.max_workers or 'auto'})")
else:
print(f"Execution mode: ASYNCIO (max_concurrent: {args.max_concurrent})")

# Count successes
successful = sum(1 for r in results if getattr(r, "reward", 0) > 0)
print(f"Successful tasks: {successful}/{len(results)} ({100*successful/len(results):.1f}%)")

print(
f"Successful tasks: {successful}/{len(results)} ({100 * successful / len(results):.1f}%)"
)

else:
print(f"Execution mode: Single Task (max_steps: {args.max_steps})")
await run_single_task(
Expand Down
13 changes: 11 additions & 2 deletions hud/cli/rl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ def rl_main(
ctx: typer.Context,
model: str = typer.Option("Qwen/Qwen2.5-3B-Instruct", "--model", "-m", help="Model to train"),
dataset: str | None = typer.Option(
None, "--dataset", "-d", help="Override dataset from lock file"
None,
"--dataset",
"-d",
help="Dataset: JSON file path or HuggingFace name (auto-detects if not provided)",
),
config: Path | None = typer.Option(None, "--config", "-c", help="Config YAML path"), # noqa: B008
gpus: str = typer.Option("2xA100", "--gpus", help="GPU configuration (e.g., 2xA100, 4xH100)"),
Expand All @@ -39,9 +42,15 @@ def rl_main(
3. Push environment to registry if needed
4. Start remote training on Prime Intellect

Dataset can be:
- A local JSON file with tasks (e.g., tasks.json)
- A HuggingFace dataset name (e.g., 'username/dataset-name')
- Auto-detected from current directory if not specified

Examples:
hud rl # Interactive mode with prompts
hud rl # Interactive mode, auto-detect tasks.json
hud rl --model gpt2 # Train with specific model
hud rl --dataset tasks.json # Use local task file
hud rl --gpus 4xH100 # Use different GPU configuration
hud rl init my-env:latest # Generate config for environment
"""
Expand Down
4 changes: 4 additions & 0 deletions hud/cli/rl/pod.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ async def create_and_connect_prime_pod(
image: str,
team_id: str | None = None,
dataset_size: int | None = None,
is_json_file: bool = False,
) -> None:
"""Create a Prime Intellect pod and connect to it for training."""
design.section_title("🌐 Creating Prime Intellect Pod")
Expand Down Expand Up @@ -330,6 +331,7 @@ async def create_and_connect_prime_pod(
output_dir=output_dir,
image=image,
dataset_size=dataset_size,
is_json_file=is_json_file,
)
else:
# Manual fallback
Expand Down Expand Up @@ -457,6 +459,7 @@ async def run_prime_training(
auto_create_pod: str | None = None,
team_id: str | None = None,
dataset_size: int | None = None,
is_json_file: bool = False,
) -> None:
"""Run training on Prime Intellect infrastructure."""
# Check API key
Expand Down Expand Up @@ -488,4 +491,5 @@ async def run_prime_training(
image=image,
team_id=team_id,
dataset_size=dataset_size,
is_json_file=is_json_file,
)
36 changes: 34 additions & 2 deletions hud/cli/rl/ssh.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ async def connect_and_train(
output_dir: Path,
image: str,
dataset_size: int | None = None,
is_json_file: bool = False,
) -> None:
"""Connect to the pod via SSH and run training commands."""
design.section_title("🚀 Starting Remote Training")
Expand Down Expand Up @@ -175,6 +176,37 @@ async def connect_and_train(
design.info("Make sure scp is installed and in your PATH")
raise typer.Exit(1) from e

# If dataset is a JSON file, copy it too
remote_dataset = dataset # Default to unchanged
if is_json_file:
design.info("Copying task file to pod...")
try:
# On Windows, we need to ensure proper path formatting
dataset_path = str(dataset).replace("\\", "/")
# Extract just the filename for the remote path
dataset_filename = os.path.basename(dataset)
remote_dataset = f"/root/{dataset_filename}"

scp_cmd = [
"scp",
"-i",
str(ssh_key_path),
"-P",
ssh_port,
"-o",
"StrictHostKeyChecking=no",
"-o",
"UserKnownHostsFile=/dev/null",
dataset_path,
f"{ssh_user_host}:{remote_dataset}",
]
design.debug(f"Running: {' '.join(scp_cmd)}")
subprocess.run(scp_cmd, check=True) # noqa: S603, ASYNC221
design.success(f"Task file copied to {remote_dataset}")
except subprocess.CalledProcessError as e:
design.error(f"Failed to copy task file: {e}")
raise typer.Exit(1) from e

design.info("Setting up environment and starting training...")
design.info("This will take a few minutes for initial setup, then training will begin.")
design.info("")
Expand All @@ -196,7 +228,7 @@ async def connect_and_train(
"# Load environment",
"env = vf.load_environment(",
' env_id="hud-vf-gym",',
f' taskset="{dataset}",',
f' taskset="{remote_dataset}",',
' config_path="/root/config.yaml",',
f" num_tasks={dataset_size},",
")",
Expand Down Expand Up @@ -242,7 +274,7 @@ async def connect_and_train(
"uv venv --python 3.12 && "
"source .venv/bin/activate && "
# Install packages
"prime env install hud/hud-vf-gym@0.1.0 && "
"prime env install hud/hud-vf-gym@0.1.1 && "
"uv pip install 'verifiers[train]' && "
"uv pip install flash-attn --no-build-isolation && "
# Set environment variables
Expand Down
Loading
Loading