hud-evals · lorenss-m · Aug 31, 2025 · Aug 30, 2025 · Aug 30, 2025 · Aug 30, 2025
diff --git a/examples/run_evaluation.py b/examples/run_evaluation.py
@@ -43,12 +43,7 @@
 from hud.agents import ClaudeAgent, OperatorAgent
 from hud.agents.misc.response_agent import ResponseAgent
 from hud.clients import MCPClient
-from hud.datasets import (
-    Task, 
-    run_dataset, 
-    run_dataset_parallel, 
-    run_dataset_parallel_manual
-)
+from hud.datasets import Task, run_dataset, run_dataset_parallel, run_dataset_parallel_manual
 
 logger = logging.getLogger(__name__)
 
@@ -137,7 +132,7 @@ async def run_full_dataset(
     max_concurrent_per_worker: int = 25,
 ) -> list[Any]:
     """Run evaluation across the entire dataset.
-    
+
     Uses either asyncio-based run_dataset or process-based run_dataset_parallel
     depending on the parallel flag.
     """
@@ -157,7 +152,7 @@ async def run_full_dataset(
         }
 
     eval_name = f"Evaluation {dataset_name.split('/')[-1]}"
-    
+
     if parallel:
         print(f"🚀 Running PARALLEL evaluation (workers: {max_workers or 'auto'})…")
         if max_workers is None:
@@ -214,44 +209,63 @@ def parse_args() -> argparse.Namespace:  # type: ignore[valid-type]
   %(prog)s hud-evals/LargeDataset --full --parallel   # Large dataset (100+ tasks)
         """,
     )
-    
+
     parser.add_argument("dataset", help="HuggingFace dataset ID")
     parser.add_argument("--full", action="store_true", help="Run entire dataset")
-    
+
     # Agent
     parser.add_argument("--agent", choices=["claude", "openai"], default="claude")
     parser.add_argument("--model", default=None, help="Model override")
-    parser.add_argument("--allowed-tools", dest="allowed_tools", help="Tool allowlist (comma-separated)")
-
+    parser.add_argument(
+        "--allowed-tools", dest="allowed_tools", help="Tool allowlist (comma-separated)"
+    )
+
     # Concurrency
-    parser.add_argument("--max-concurrent", dest="max_concurrent", type=int, default=50, 
-                        help="Max concurrent tasks (default: 50)")
-
+    parser.add_argument(
+        "--max-concurrent",
+        dest="max_concurrent",
+        type=int,
+        default=50,
+        help="Max concurrent tasks (default: 50)",
+    )
+
     # Task settings
-    parser.add_argument("--max-steps", dest="max_steps", type=int, default=10,
-                        help="Max steps per task (default: 10)")
-
+    parser.add_argument(
+        "--max-steps",
+        dest="max_steps",
+        type=int,
+        default=10,
+        help="Max steps per task (default: 10)",
+    )
+
     # Parallel mode (100+ tasks)
-    parser.add_argument("--parallel", action="store_true", help="Use parallel execution for large datasets")
+    parser.add_argument(
+        "--parallel", action="store_true", help="Use parallel execution for large datasets"
+    )
     parser.add_argument("--max-workers", dest="max_workers", type=int, help="Worker processes")
-    parser.add_argument("--max-concurrent-per-worker", dest="max_concurrent_per_worker", type=int, default=25,
-                        help="Max concurrent tasks per worker")
-
+    parser.add_argument(
+        "--max-concurrent-per-worker",
+        dest="max_concurrent_per_worker",
+        type=int,
+        default=25,
+        help="Max concurrent tasks per worker",
+    )
+
     # Logging
-    parser.add_argument("--verbose", "-v", action="store_true", help="Show detailed agent step logs")
-
+    parser.add_argument(
+        "--verbose", "-v", action="store_true", help="Show detailed agent step logs"
+    )
+
     return parser.parse_args()
 
 
 async def main() -> None:
     args = parse_args()
-    
+
     if args.verbose:
         # Detailed logs - show everything including agent steps
         logging.basicConfig(
-            level=logging.INFO,
-            format='%(asctime)s - %(name)s - %(message)s',
-            datefmt='%H:%M:%S'
+            level=logging.INFO, format="%(asctime)s - %(name)s - %(message)s", datefmt="%H:%M:%S"
         )
         # Ensure HUD agent logs are visible
         logging.getLogger("hud.agents").setLevel(logging.INFO)
@@ -265,8 +279,9 @@ async def main() -> None:
 
     if args.full:
         import time
+
         start_time = time.time()
-        
+
         results = await run_full_dataset(
             args.dataset,
             agent_type=args.agent,
@@ -278,26 +293,28 @@ async def main() -> None:
             max_workers=args.max_workers,
             max_concurrent_per_worker=args.max_concurrent_per_worker,
         )
-        
+
         elapsed = time.time() - start_time
-        
+
         # Print statistics
         print("\n" + "=" * 50)
         print("📊 Evaluation Complete!")
         print("=" * 50)
         print(f"Total tasks: {len(results)}")
         print(f"Time elapsed: {elapsed:.2f} seconds")
-        print(f"Throughput: {len(results)/elapsed:.2f} tasks/second")
-        
+        print(f"Throughput: {len(results) / elapsed:.2f} tasks/second")
+
         if args.parallel:
             print(f"Execution mode: PARALLEL (workers: {args.max_workers or 'auto'})")
         else:
             print(f"Execution mode: ASYNCIO (max_concurrent: {args.max_concurrent})")
-        
+
         # Count successes
         successful = sum(1 for r in results if getattr(r, "reward", 0) > 0)
-        print(f"Successful tasks: {successful}/{len(results)} ({100*successful/len(results):.1f}%)")
-
+        print(
+            f"Successful tasks: {successful}/{len(results)} ({100 * successful / len(results):.1f}%)"
+        )
+
     else:
         print(f"Execution mode: Single Task (max_steps: {args.max_steps})")
         await run_single_task(

diff --git a/hud/cli/rl/__init__.py b/hud/cli/rl/__init__.py
@@ -23,7 +23,10 @@ def rl_main(
     ctx: typer.Context,
     model: str = typer.Option("Qwen/Qwen2.5-3B-Instruct", "--model", "-m", help="Model to train"),
     dataset: str | None = typer.Option(
-        None, "--dataset", "-d", help="Override dataset from lock file"
+        None,
+        "--dataset",
+        "-d",
+        help="Dataset: JSON file path or HuggingFace name (auto-detects if not provided)",
     ),
     config: Path | None = typer.Option(None, "--config", "-c", help="Config YAML path"),  # noqa: B008
     gpus: str = typer.Option("2xA100", "--gpus", help="GPU configuration (e.g., 2xA100, 4xH100)"),
@@ -39,9 +42,15 @@ def rl_main(
     3. Push environment to registry if needed
     4. Start remote training on Prime Intellect
 
+    Dataset can be:
+    - A local JSON file with tasks (e.g., tasks.json)
+    - A HuggingFace dataset name (e.g., 'username/dataset-name')
+    - Auto-detected from current directory if not specified
+
     Examples:
-        hud rl                    # Interactive mode with prompts
+        hud rl                    # Interactive mode, auto-detect tasks.json
         hud rl --model gpt2       # Train with specific model
+        hud rl --dataset tasks.json  # Use local task file
         hud rl --gpus 4xH100      # Use different GPU configuration
         hud rl init my-env:latest # Generate config for environment
     """

diff --git a/hud/cli/rl/pod.py b/hud/cli/rl/pod.py
@@ -62,6 +62,7 @@ async def create_and_connect_prime_pod(
     image: str,
     team_id: str | None = None,
     dataset_size: int | None = None,
+    is_json_file: bool = False,
 ) -> None:
     """Create a Prime Intellect pod and connect to it for training."""
     design.section_title("🌐 Creating Prime Intellect Pod")
@@ -330,6 +331,7 @@ async def create_and_connect_prime_pod(
                     output_dir=output_dir,
                     image=image,
                     dataset_size=dataset_size,
+                    is_json_file=is_json_file,
                 )
             else:
                 # Manual fallback
@@ -457,6 +459,7 @@ async def run_prime_training(
     auto_create_pod: str | None = None,
     team_id: str | None = None,
     dataset_size: int | None = None,
+    is_json_file: bool = False,
 ) -> None:
     """Run training on Prime Intellect infrastructure."""
     # Check API key
@@ -488,4 +491,5 @@ async def run_prime_training(
         image=image,
         team_id=team_id,
         dataset_size=dataset_size,
+        is_json_file=is_json_file,
     )
diff --git a/hud/cli/rl/ssh.py b/hud/cli/rl/ssh.py
@@ -101,6 +101,7 @@ async def connect_and_train(
     output_dir: Path,
     image: str,
     dataset_size: int | None = None,
+    is_json_file: bool = False,
 ) -> None:
     """Connect to the pod via SSH and run training commands."""
     design.section_title("🚀 Starting Remote Training")
@@ -175,6 +176,37 @@ async def connect_and_train(
             design.info("Make sure scp is installed and in your PATH")
         raise typer.Exit(1) from e
 
+    # If dataset is a JSON file, copy it too
+    remote_dataset = dataset  # Default to unchanged
+    if is_json_file:
+        design.info("Copying task file to pod...")
+        try:
+            # On Windows, we need to ensure proper path formatting
+            dataset_path = str(dataset).replace("\\", "/")
+            # Extract just the filename for the remote path
+            dataset_filename = os.path.basename(dataset)
+            remote_dataset = f"/root/{dataset_filename}"
+
+            scp_cmd = [
+                "scp",
+                "-i",
+                str(ssh_key_path),
+                "-P",
+                ssh_port,
+                "-o",
+                "StrictHostKeyChecking=no",
+                "-o",
+                "UserKnownHostsFile=/dev/null",
+                dataset_path,
+                f"{ssh_user_host}:{remote_dataset}",
+            ]
+            design.debug(f"Running: {' '.join(scp_cmd)}")
+            subprocess.run(scp_cmd, check=True)  # noqa: S603, ASYNC221
+            design.success(f"Task file copied to {remote_dataset}")
+        except subprocess.CalledProcessError as e:
+            design.error(f"Failed to copy task file: {e}")
+            raise typer.Exit(1) from e
+
     design.info("Setting up environment and starting training...")
     design.info("This will take a few minutes for initial setup, then training will begin.")
     design.info("")
@@ -196,7 +228,7 @@ async def connect_and_train(
         "# Load environment",
         "env = vf.load_environment(",
         '    env_id="hud-vf-gym",',
-        f'    taskset="{dataset}",',
+        f'    taskset="{remote_dataset}",',
         '    config_path="/root/config.yaml",',
         f"    num_tasks={dataset_size},",
         ")",
@@ -242,7 +274,7 @@ async def connect_and_train(
         "uv venv --python 3.12 && "
         "source .venv/bin/activate && "
         # Install packages
-        "prime env install hud/hud-vf-gym@0.1.0 && "
+        "prime env install hud/hud-vf-gym@0.1.1 && "
         "uv pip install 'verifiers[train]' && "
         "uv pip install flash-attn --no-build-isolation && "
         # Set environment variables