Commit 3a0721d

Move rl_perf_reproduce.py to tests/ dir and add functional test
Signed-off-by: Shuyi Xiong <219646547+shuyixiong@users.noreply.github.com>
1 parent: 38bb037 · commit: 3a0721d

File tree

3 files changed (+119, -22 lines)

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
# RL Framework Integration Tests

This directory contains integration tests for TensorRT-LLM with the Ray orchestrator, specifically designed to cover usage patterns from various RL (Reinforcement Learning) frameworks.

## Available Scripts

| Script | Description |
|--------|-------------|
| `run_rl_perf_reproduce.py` | Emulates RL workload performance with multiple AsyncLLM instances distributed across GPUs using Ray placement groups |

## Usage Examples

### RL Performance Reproduction

The `run_rl_perf_reproduce.py` script creates multiple TensorRT-LLM instances in parallel to simulate RL rollout workloads.

**TP=4 with 2 instances (8 GPUs total):**

```bash
python run_rl_perf_reproduce.py \
    --model_dir /path/to/model_dir \
    --data_path /path/to/prompts.json \
    --num_instances 2 \
    --tp_size 4 \
    --top_p 1 \
    --logprobs 1 \
    --max_batch_size 1024 \
    --enable_cuda_graph_padding
```

**TP=1 with 8 instances (8 GPUs total):**

```bash
python run_rl_perf_reproduce.py \
    --model_dir /path/to/model_dir \
    --data_path /path/to/prompts.json \
    --num_instances 8 \
    --tp_size 1 \
    --top_p 1 \
    --logprobs 1 \
    --max_batch_size 384 \
    --enable_cuda_graph_padding
```
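Both invocations above size the job so that `num_instances * tp_size` stays within the single-node limit of 8 GPUs. A minimal sketch of that sizing arithmetic (the helper names are illustrative, not part of the script):

```python
def gpus_required(num_instances: int, tp_size: int) -> int:
    """Total GPUs consumed when each instance holds tp_size ranks."""
    return num_instances * tp_size


def fits_single_node(num_instances: int, tp_size: int, node_gpus: int = 8) -> bool:
    """The reproduction script is single-node only, so the product
    must not exceed the GPUs available on one node."""
    return gpus_required(num_instances, tp_size) <= node_gpus


# The two documented configurations each saturate one 8-GPU node.
print(fits_single_node(2, 4))  # TP=4, 2 instances -> True
print(fits_single_node(8, 1))  # TP=1, 8 instances -> True
print(fits_single_node(4, 4))  # would need 16 GPUs -> False
```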
## Data Format

The `--data_path` argument should point to a JSON file containing a list of prompts, where each prompt is a list of token IDs:

```json
[
    [1, 2345, 6789, ...],
    [1, 3456, 7890, ...],
    ...
]
```

## Notes

- RL perf reproduction scripts support single-node execution only (max 8 GPUs)
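A file in the format above can be produced with a short stdlib-only sketch. In real runs the token IDs come from the model's tokenizer (e.g. `tokenizer.encode(...)`); the IDs below are placeholders:

```python
import json
from pathlib import Path

# Placeholder token-ID prompts; in practice these come from
# encoding real text with the tokenizer of the model under test.
prompts = [
    [1, 2345, 6789],
    [1, 3456, 7890],
]

data_path = Path("prompts.json")
data_path.write_text(json.dumps(prompts))

# Sanity-check the expected shape: a list of lists of ints.
loaded = json.loads(data_path.read_text())
assert all(isinstance(p, list) for p in loaded)
assert all(isinstance(t, int) for p in loaded for t in p)
print(f"wrote {len(loaded)} prompts to {data_path}")
```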

`examples/ray_orchestrator/rl_perf_repro.py` renamed to `tests/integration/defs/ray_orchestrator/RL/run_rl_perf_reproduce.py`

Lines changed: 0 additions & 22 deletions
@@ -1,25 +1,3 @@

```diff
-##############################################################################
-# OVERVIEW:
-# This script is to emulate the performance of running TensorRT-LLM with Ray
-# orchestrator for Reinforcement Learning (RL) workloads. It creates multiple
-# AsyncLLM instances distributed across GPUs using Ray placement groups,
-# enabling parallel generation for RL training scenarios.
-#
-# EXAMPLE USAGE:
-# python rl_perf_repro.py \
-#   --model_dir /path/to/model_dir \
-#   --data_path /path/to/prompts.json \
-#   --num_instances 2 \
-#   --tp_size 4 \
-#   --max_batch_size 1024 \
-#   --enable_cuda_graph_padding \
-#   --enable_block_reuse \
-#   --logprobs 1
-#
-# NOTE:
-# - This script supports single-node execution only (max 8 GPUs)
-##############################################################################
 import argparse
 import asyncio
 import json
```
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@

```python
import json
import tempfile
from pathlib import Path

import pytest
from defs.common import venv_check_call
from defs.conftest import integration_path, llm_models_root
from transformers import AutoTokenizer


@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize(
    "tp_size, num_instances", [(2, 2), (1, 4)], ids=["tp2_instances2", "tp1_instances4"]
)
def test_rl_perf_reproduce(llm_venv, tp_size, num_instances):
    script_path = (
        integration_path() / "defs" / "ray_orchestrator" / "RL" / "run_rl_perf_reproduce.py"
    )
    math_txt_path = integration_path() / "test_input_files" / "math.txt"
    model_dir = f"{llm_models_root()}/Qwen2-7B-Instruct"

    if tp_size == 2:
        max_batch_size = 512
    else:
        max_batch_size = 256

    with tempfile.TemporaryDirectory() as tmpdir:
        prompt_text = "The president of the United States is"

        tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
        token_ids = tokenizer.encode(prompt_text, add_special_tokens=False)

        # Replicate to create a batch of 1024 prompts
        batch_size = 1024
        prompts = [token_ids for _ in range(batch_size)]

        data_path = Path(tmpdir) / "prompts.json"
        with open(data_path, "w") as f:
            json.dump(prompts, f)

        venv_check_call(
            llm_venv,
            [
                str(script_path),
                "--model_dir",
                model_dir,
                "--data_path",
                str(data_path),
                "--num_instances",
                str(num_instances),
                "--tp_size",
                str(tp_size),
                "--logprobs",
                "1",
                "--max_batch_size",
                str(max_batch_size),
                "--enable_block_reuse",
                "--enable_cuda_graph_padding",
            ],
        )
```
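The argv construction in the test can be exercised on its own. A small sketch (the helper name and the example paths are illustrative, not part of the test harness):

```python
def build_reproduce_argv(script_path, model_dir, data_path,
                         num_instances, tp_size, max_batch_size):
    """Assemble the CLI invocation the test hands to the venv runner.
    Every value is stringified because subprocess argv must be strings."""
    return [
        str(script_path),
        "--model_dir", str(model_dir),
        "--data_path", str(data_path),
        "--num_instances", str(num_instances),
        "--tp_size", str(tp_size),
        "--logprobs", "1",
        "--max_batch_size", str(max_batch_size),
        "--enable_block_reuse",
        "--enable_cuda_graph_padding",
    ]


# Mirrors the tp2_instances2 parametrization: tp_size=2 pairs with
# max_batch_size=512 in the test above.
argv = build_reproduce_argv("run_rl_perf_reproduce.py",
                            "/models/Qwen2-7B-Instruct",
                            "/tmp/prompts.json", 2, 2, 512)
print(" ".join(argv))
```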
