1 change: 1 addition & 0 deletions jenkins/L0_Test.groovy
@@ -3023,6 +3023,7 @@ def launchTestJobs(pipeline, testFilter)
"DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
"DGX_B200-4_GPUs-PyTorch-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4],
"DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
2 changes: 2 additions & 0 deletions tensorrt_llm/executor/ray_gpu_worker.py
@@ -91,6 +91,8 @@ def setup_distributed_env_and_worker(self, port: int):
store=self.store,
world_size=self.world_size,
rank=self.rank)
assert torch.distributed.get_world_size(
) == self.world_size, "Process group world size must match the expected world size"
logger.info(
f"[Rank {self.rank}] Finished PG init. Global GPU ID: {self.gpu}, local GPU ID: {self.local_gpu}"
)
1 change: 1 addition & 0 deletions tests/integration/defs/conftest.py
@@ -2362,6 +2362,7 @@ def pytest_configure(config):
tqdm.tqdm.monitor_interval = 0
if config.getoption("--run-ray"):
os.environ["TLLM_DISABLE_MPI"] = "1"
os.environ["TLLM_RAY_FORCE_LOCAL_CLUSTER"] = "1"

# Initialize PeriodicJUnitXML reporter if enabled
periodic = config.getoption("--periodic-junit", default=False)
13 changes: 10 additions & 3 deletions tests/integration/defs/disaggregated/test_disaggregated.py
@@ -22,15 +22,16 @@

import pytest
import yaml
from defs.common import (revise_disagg_config_file_with_free_ports,
from defs.common import (get_free_port_in_ci,
revise_disagg_config_file_with_free_ports,
wait_for_server)
from defs.conftest import (get_sm_version, llm_models_root, skip_arm,
skip_no_hopper)
from defs.trt_test_alternative import check_call, check_output, popen
from test_common.perf_metrics_utils import (get_timing_metrics,
validate_timing_metrics)

from tensorrt_llm._utils import get_free_port, mpi_disabled
from tensorrt_llm._utils import mpi_disabled
from tensorrt_llm.logger import logger


@@ -357,8 +358,14 @@ def run_disaggregated_test(example_dir,

extra_config_files = []
workers_cmds = []
subprocess.run(['ray', 'start', '--head', '--disable-usage-stats'],
ray_port = get_free_port_in_ci()
subprocess.run([
'ray', 'start', '--head', '--port',
str(ray_port), '--disable-usage-stats'
],
check=True)
run_env["RAY_ADDRESS"] = f"localhost:{ray_port}"
run_env["TLLM_RAY_FORCE_LOCAL_CLUSTER"] = "0"

# Generate ctx and gen server worker commands
ctx_extra_config_file = get_extra_llm_config(config['context_servers'],
16 changes: 12 additions & 4 deletions tests/integration/defs/examples/test_ray.py
@@ -2,7 +2,7 @@
import subprocess

import pytest
from defs.common import venv_check_call, wait_for_server
from defs.common import get_free_port_in_ci, venv_check_call, wait_for_server
from defs.conftest import get_device_count, llm_models_root


@@ -66,16 +66,24 @@ def test_ray_disaggregated_serving(ray_example_root, llm_venv, tp_size):
script_path = os.path.join(disagg_dir, "disagg_serving_local.sh")
model_dir = f"{llm_models_root()}/llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
subprocess.run("ray stop --force", shell=True, check=False)

ray_port = get_free_port_in_ci()
subprocess.run(f"ray start --head --port {ray_port} --disable-usage-stats",
shell=True,
check=False)

env_copy = os.environ.copy()
env_copy.update({"RAY_ADDRESS": f"localhost:{ray_port}"},
{"TLLM_RAY_FORCE_LOCAL_CLUSTER": "0"})
proc = subprocess.Popen(
[
"bash", script_path, "--executor", "ray", "--model", model_dir,
"--tp_size",
"bash", script_path, "--executor", "ray", "--attach", "--model",
model_dir, "--tp_size",
str(tp_size)
],
cwd=disagg_dir,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env_copy,
)
try:
assert wait_for_server("localhost", 8000, timeout_seconds=180), \
59 changes: 59 additions & 0 deletions tests/integration/defs/ray_orchestrator/RL/README.md
@@ -0,0 +1,59 @@
# RL Framework Integration Tests

This directory contains integration tests for TensorRT-LLM with the [Ray orchestrator](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/ray_orchestrator), specifically covering usage patterns from RL (Reinforcement Learning) frameworks such as VeRL and NeMo RL.

## Available Scripts

| Script | Description |
|--------|-------------|
| `run_rl_perf_reproduce.py` | Emulates RL workload performance with multiple AsyncLLM instances distributed across GPUs using Ray placement groups |

## Usage Examples

### RL Performance Reproduction

The `run_rl_perf_reproduce.py` script creates multiple TensorRT-LLM instances in parallel to simulate RL rollout workloads.

**TP=4 with 2 instances (8 GPUs total):**

```bash
python run_rl_perf_reproduce.py \
--model_dir /path/to/model_dir \
--data_path /path/to/prompts.json \
--num_instances 2 \
--tp_size 4 \
--top_p 1 \
--logprobs 1 \
--max_batch_size 1024 \
--enable_cuda_graph_padding
```

**TP=1 with 8 instances (8 GPUs total):**

```bash
python run_rl_perf_reproduce.py \
--model_dir /path/to/model_dir \
--data_path /path/to/prompts.json \
--num_instances 8 \
--tp_size 1 \
--top_p 1 \
--logprobs 1 \
--max_batch_size 384 \
--enable_cuda_graph_padding
```
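
For reference, the following is a minimal sketch (not taken from the script itself) of the placement-group pattern described above: one placement group per instance, each reserving `tp_size` GPUs. The variable names `num_instances` and `tp_size` simply mirror the CLI flags.

```python
import ray
from ray.util.placement_group import placement_group

ray.init()

num_instances = 2  # mirrors --num_instances
tp_size = 4        # mirrors --tp_size

# One placement group per instance, with one single-GPU bundle per
# tensor-parallel rank, so each instance gets a dedicated set of GPUs.
groups = [
    placement_group([{"GPU": 1, "CPU": 1}] * tp_size, strategy="PACK")
    for _ in range(num_instances)
]
ray.get([pg.ready() for pg in groups])  # block until the GPUs are reserved
# Each instance's workers would then be scheduled with
# PlacementGroupSchedulingStrategy(placement_group=groups[i]).
```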

## Data Format

The `--data_path` argument should point to a JSON file containing a list of prompts, where each prompt is a list of token IDs:

```json
[
[1, 2345, 6789, ...],
[1, 3456, 7890, ...],
...
]
```
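
As an illustration, a prompts file in this format could be produced with a Hugging Face tokenizer; the model path, prompt strings, and output file name below are placeholders.

```python
import json

from transformers import AutoTokenizer  # assumes transformers is installed

tokenizer = AutoTokenizer.from_pretrained("/path/to/model_dir")
prompts = [
    "Explain tensor parallelism in one sentence.",
    "What is a Ray placement group?",
]
# Encode each prompt to token IDs and write the list-of-lists JSON.
token_ids = [tokenizer.encode(p) for p in prompts]
with open("prompts.json", "w") as f:
    json.dump(token_ids, f)
```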

## Notes

- RL perf reproduction scripts support single-node execution only (at most 8 GPUs)