Merged (21 commits)
1 change: 1 addition & 0 deletions jenkins/L0_Test.groovy
@@ -3023,6 +3023,7 @@ def launchTestJobs(pipeline, testFilter)
"DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
"DGX_B200-4_GPUs-PyTorch-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4],
"DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
2 changes: 2 additions & 0 deletions tensorrt_llm/executor/ray_gpu_worker.py
@@ -91,6 +91,8 @@ def setup_distributed_env_and_worker(self, port: int):
store=self.store,
world_size=self.world_size,
rank=self.rank)
assert torch.distributed.get_world_size(
) == self.world_size, "Process group world size must match the expected world size"
logger.info(
f"[Rank {self.rank}] Finished PG init. Global GPU ID: {self.gpu}, local GPU ID: {self.local_gpu}"
)
14 changes: 1 addition & 13 deletions tests/integration/defs/conftest.py
@@ -37,7 +37,6 @@
import yaml
from _pytest.mark import ParameterSet

from tensorrt_llm._utils import mpi_disabled
from tensorrt_llm.bindings import ipc_nvls_supported
from tensorrt_llm.llmapi.mpi_session import get_mpi_world_size

@@ -2362,6 +2361,7 @@ def pytest_configure(config):
tqdm.tqdm.monitor_interval = 0
if config.getoption("--run-ray"):
os.environ["TLLM_DISABLE_MPI"] = "1"
os.environ["TLLM_RAY_FORCE_LOCAL_CLUSTER"] = "1"

# Initialize PeriodicJUnitXML reporter if enabled
periodic = config.getoption("--periodic-junit", default=False)
@@ -2825,15 +2825,3 @@ def torch_empty_cache() -> None:
gc.collect()
torch.cuda.empty_cache()
gc.collect()


@pytest.fixture(autouse=True)
def ray_cleanup(llm_venv) -> None:
yield

if mpi_disabled():
llm_venv.run_cmd([
"-m",
"ray.scripts.scripts",
"stop",
])
37 changes: 28 additions & 9 deletions tests/integration/defs/disaggregated/test_disaggregated.py
@@ -21,6 +21,12 @@
from typing import Callable

import pytest

try:
import ray
except ImportError:
import tensorrt_llm.ray_stub as ray

import yaml
from defs.common import (revise_disagg_config_file_with_free_ports,
wait_for_server)
@@ -30,7 +36,7 @@
from test_common.perf_metrics_utils import (get_timing_metrics,
validate_timing_metrics)

from tensorrt_llm._utils import get_free_port, mpi_disabled
from tensorrt_llm._utils import mpi_disabled
from tensorrt_llm.logger import logger


@@ -357,8 +363,6 @@ def run_disaggregated_test(example_dir,

extra_config_files = []
workers_cmds = []
subprocess.run(['ray', 'start', '--head', '--disable-usage-stats'],
check=True)

# Generate ctx and gen server worker commands
ctx_extra_config_file = get_extra_llm_config(config['context_servers'],
@@ -415,6 +419,21 @@
use_ray=False)

else:
runtime_env = {
"env_vars": {
"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES": "1"
}
}
ray.init(address="local",
include_dashboard=False,
ignore_reinit_error=True,
runtime_env=runtime_env)
gcs_addr = ray.get_runtime_context().gcs_address
ray_port = str(gcs_addr.split(":")[1])
run_env.update({
"RAY_ADDRESS": f"localhost:{ray_port}",
"TLLM_RAY_FORCE_LOCAL_CLUSTER": "0"
})
workers_proc = []
with contextlib.ExitStack() as stack:
workers_log = stack.enter_context(
@@ -470,16 +489,16 @@
logger.error(f.read())
raise
finally:
if use_ray:
subprocess.run(['ray', 'stop', '--force'], check=False)
for extra_file in extra_config_files:
if os.path.exists(extra_file):
os.remove(extra_file)
elif 'server_proc' in locals() and 'workers_proc' in locals():
if 'server_proc' in locals() and 'workers_proc' in locals():
server_proc.terminate()
workers_proc.terminate()
server_proc.wait()
workers_proc.wait()
if use_ray:
ray.shutdown()
for extra_file in extra_config_files:
if os.path.exists(extra_file):
os.remove(extra_file)


@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'],
101 changes: 58 additions & 43 deletions tests/integration/defs/examples/test_ray.py
@@ -1,9 +1,15 @@
import os
import subprocess

try:
import ray
except ImportError:
import tensorrt_llm.ray_stub as ray

import pytest
from defs.common import venv_check_call, wait_for_server
from defs.conftest import get_device_count, llm_models_root
from defs.trt_test_alternative import popen


@pytest.fixture(scope="module")
@@ -65,48 +71,57 @@ def test_ray_disaggregated_serving(ray_example_root, llm_venv, tp_size):
disagg_dir = os.path.join(ray_example_root, "disaggregated")
script_path = os.path.join(disagg_dir, "disagg_serving_local.sh")
model_dir = f"{llm_models_root()}/llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
subprocess.run("ray stop --force", shell=True, check=False)

proc = subprocess.Popen(
[
"bash", script_path, "--executor", "ray", "--model", model_dir,
"--tp_size",
str(tp_size)
],
cwd=disagg_dir,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
try:
assert wait_for_server("localhost", 8000, timeout_seconds=180), \
"Disaggregated server failed to start within 3 minutes"

result = subprocess.run([
"curl", "-sS", "-w", "\n%{http_code}",
"http://localhost:8000/v1/completions", "-H",
"Content-Type: application/json", "-d",
'{"model":"TinyLlama-1.1B-Chat-v1.0","prompt":"NVIDIA is a great company because","max_tokens":16,"temperature":0}'
],
capture_output=True,
text=True,
timeout=30)

*body_lines, status_line = result.stdout.strip().splitlines()
body = "\n".join(body_lines)
status = int(status_line)

print("HTTP status:", status)
print("Response body:", body)

assert result.returncode == 0, f"curl exit {result.returncode}"
assert status == 200, f"Expected 200, got {status}"

try:
runtime_env = {
"env_vars": {
"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES": "1"
}
}
ray.init(address="local",
include_dashboard=False,
ignore_reinit_error=True,
runtime_env=runtime_env)
gcs_addr = ray.get_runtime_context().gcs_address
ray_port = str(gcs_addr.split(":")[1])

env_copy = os.environ.copy()
env_copy.update({
"RAY_ADDRESS": f"localhost:{ray_port}",
"TLLM_RAY_FORCE_LOCAL_CLUSTER": "0"
})
with popen(
[
"bash", script_path, "--executor", "ray", "--attach", "--model",
model_dir, "--tp_size",
str(tp_size)
],
cwd=disagg_dir,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env_copy,
):
assert wait_for_server("localhost", 8000, timeout_seconds=180), \
"Disaggregated server failed to start within 3 minutes"

result = subprocess.run([
"curl", "-sS", "-w", "\n%{http_code}",
"http://localhost:8000/v1/completions", "-H",
"Content-Type: application/json", "-d",
'{"model":"TinyLlama-1.1B-Chat-v1.0","prompt":"NVIDIA is a great company because","max_tokens":16,"temperature":0}'
],
capture_output=True,
text=True,
timeout=30)

*body_lines, status_line = result.stdout.strip().splitlines()
body = "\n".join(body_lines)
status = int(status_line)

print("HTTP status:", status)
print("Response body:", body)

assert result.returncode == 0, f"curl exit {result.returncode}"
assert status == 200, f"Expected 200, got {status}"
finally:
proc.terminate()
try:
proc.wait(timeout=10)
except Exception:
proc.kill()

subprocess.run("ray stop --force", shell=True, check=False)
subprocess.run("pkill -9 -f trtllm-serve", shell=True, check=False)
ray.shutdown()
59 changes: 59 additions & 0 deletions tests/integration/defs/ray_orchestrator/RL/README.md
@@ -0,0 +1,59 @@
# RL Framework Integration Tests

This directory contains integration tests for TensorRT-LLM with the [Ray orchestrator](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/ray_orchestrator), designed to cover usage patterns from reinforcement learning (RL) frameworks such as VeRL and NeMo RL.

## Available Scripts

| Script | Description |
|--------|-------------|
| `run_rl_perf_reproduce.py` | Emulates RL workload performance with multiple AsyncLLM instances distributed across GPUs using Ray placement groups |

## Usage Examples

### RL Performance Reproduction

The `run_rl_perf_reproduce.py` script creates multiple TensorRT-LLM instances in parallel to simulate RL rollout workloads.
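
As a rough sketch of this pattern (not the script's actual implementation; the `RolloutWorker` actor, its methods, and the resource shapes below are illustrative), each instance can be pinned to its own GPU bundle via a Ray placement group:

```python
import ray
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

ray.init()

num_instances, tp_size = 2, 4  # e.g. 2 instances x TP=4 on an 8-GPU node

# One placement group per instance, reserving tp_size GPUs in a single bundle.
pgs = [placement_group([{"GPU": tp_size, "CPU": 1}]) for _ in range(num_instances)]
ray.get([pg.ready() for pg in pgs])


@ray.remote(num_cpus=1, num_gpus=tp_size)
class RolloutWorker:
    """Illustrative stand-in for one LLM instance; a real worker would build an engine here."""

    def __init__(self, model_dir: str):
        self.model_dir = model_dir

    def generate(self, prompt_ids):
        # Placeholder for engine.generate(...); echoes the input token IDs.
        return prompt_ids


workers = [
    RolloutWorker.options(
        scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg)
    ).remote("/path/to/model_dir")
    for pg in pgs
]
```

In the real script each worker would construct an AsyncLLM engine inside its bundle; only the scheduling skeleton is shown here.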

**TP=4 with 2 instances (8 GPUs total):**

```bash
python run_rl_perf_reproduce.py \
--model_dir /path/to/model_dir \
--data_path /path/to/prompts.json \
--num_instances 2 \
--tp_size 4 \
--top_p 1 \
--logprobs 1 \
--max_batch_size 1024 \
--enable_cuda_graph_padding
```

**TP=1 with 8 instances (8 GPUs total):**

```bash
python run_rl_perf_reproduce.py \
--model_dir /path/to/model_dir \
--data_path /path/to/prompts.json \
--num_instances 8 \
--tp_size 1 \
--top_p 1 \
--logprobs 1 \
--max_batch_size 384 \
--enable_cuda_graph_padding
```

## Data Format

The `--data_path` argument should point to a JSON file containing a list of prompts, where each prompt is a list of token IDs:

```json
[
[1, 2345, 6789, ...],
[1, 3456, 7890, ...],
...
]
```
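
For reference, a minimal sketch of reading such a file (the `load_prompts` helper name is illustrative; the script's own loading code may differ):

```python
import json
from typing import List


def load_prompts(data_path: str) -> List[List[int]]:
    """Load tokenized prompts from a JSON file of token-ID lists."""
    with open(data_path) as f:
        prompts = json.load(f)
    # Every prompt is expected to be a list of integer token IDs.
    assert isinstance(prompts, list) and all(
        isinstance(p, list) and all(isinstance(t, int) for t in p)
        for p in prompts), "data_path must contain a list of token-ID lists"
    return prompts
```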

## Notes

- RL performance reproduction scripts support single-node execution only (max 8 GPUs).