Merged (21 commits)
1 change: 1 addition & 0 deletions jenkins/L0_Test.groovy
@@ -3023,6 +3023,7 @@ def launchTestJobs(pipeline, testFilter)
"DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
"DGX_B200-4_GPUs-PyTorch-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4],
"DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
2 changes: 2 additions & 0 deletions tensorrt_llm/executor/ray_gpu_worker.py
@@ -91,6 +91,8 @@ def setup_distributed_env_and_worker(self, port: int):
store=self.store,
world_size=self.world_size,
rank=self.rank)
assert torch.distributed.get_world_size(
) == self.world_size, "Process group world size must match the expected world size"
logger.info(
f"[Rank {self.rank}] Finished PG init. Global GPU ID: {self.gpu}, local GPU ID: {self.local_gpu}"
)
14 changes: 1 addition & 13 deletions tests/integration/defs/conftest.py
@@ -37,7 +37,6 @@
import yaml
from _pytest.mark import ParameterSet

from tensorrt_llm._utils import mpi_disabled
from tensorrt_llm.bindings import ipc_nvls_supported
from tensorrt_llm.llmapi.mpi_session import get_mpi_world_size

@@ -2362,6 +2361,7 @@ def pytest_configure(config):
tqdm.tqdm.monitor_interval = 0
if config.getoption("--run-ray"):
os.environ["TLLM_DISABLE_MPI"] = "1"
os.environ["TLLM_RAY_FORCE_LOCAL_CLUSTER"] = "1"

# Initialize PeriodicJUnitXML reporter if enabled
periodic = config.getoption("--periodic-junit", default=False)
@@ -2825,15 +2825,3 @@ def torch_empty_cache() -> None:
gc.collect()
torch.cuda.empty_cache()
gc.collect()


@pytest.fixture(autouse=True)
def ray_cleanup(llm_venv) -> None:
yield

if mpi_disabled():
llm_venv.run_cmd([
"-m",
"ray.scripts.scripts",
"stop",
])
37 changes: 28 additions & 9 deletions tests/integration/defs/disaggregated/test_disaggregated.py
@@ -21,6 +21,12 @@
from typing import Callable

import pytest

try:
import ray
except ImportError:
import tensorrt_llm.ray_stub as ray

import yaml
from defs.common import (revise_disagg_config_file_with_free_ports,
wait_for_server)
@@ -30,7 +36,7 @@
from test_common.perf_metrics_utils import (get_timing_metrics,
validate_timing_metrics)

from tensorrt_llm._utils import get_free_port, mpi_disabled
from tensorrt_llm._utils import mpi_disabled
from tensorrt_llm.logger import logger


@@ -357,8 +363,6 @@ def run_disaggregated_test(example_dir,

extra_config_files = []
workers_cmds = []
subprocess.run(['ray', 'start', '--head', '--disable-usage-stats'],
check=True)

# Generate ctx and gen server worker commands
ctx_extra_config_file = get_extra_llm_config(config['context_servers'],
@@ -415,6 +419,21 @@
use_ray=False)

else:
runtime_env = {
"env_vars": {
"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES": "1"
}
}
ray.init(address="local",
include_dashboard=False,
ignore_reinit_error=True,
runtime_env=runtime_env)
gcs_addr = ray.get_runtime_context().gcs_address
ray_port = str(gcs_addr.split(":")[1])
run_env.update({
"RAY_ADDRESS": f"localhost:{ray_port}",
"TLLM_RAY_FORCE_LOCAL_CLUSTER": "0"
})
workers_proc = []
with contextlib.ExitStack() as stack:
workers_log = stack.enter_context(
@@ -470,16 +489,16 @@
logger.error(f.read())
raise
finally:
if use_ray:
subprocess.run(['ray', 'stop', '--force'], check=False)
for extra_file in extra_config_files:
if os.path.exists(extra_file):
os.remove(extra_file)
elif 'server_proc' in locals() and 'workers_proc' in locals():
if 'server_proc' in locals() and 'workers_proc' in locals():
server_proc.terminate()
workers_proc.terminate()
server_proc.wait()
workers_proc.wait()
if use_ray:
ray.shutdown()
for extra_file in extra_config_files:
if os.path.exists(extra_file):
os.remove(extra_file)


@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'],
101 changes: 58 additions & 43 deletions tests/integration/defs/examples/test_ray.py
@@ -1,9 +1,15 @@
import os
import subprocess

try:
import ray
except ImportError:
import tensorrt_llm.ray_stub as ray

import pytest
from defs.common import venv_check_call, wait_for_server
from defs.conftest import get_device_count, llm_models_root
from defs.trt_test_alternative import popen


@pytest.fixture(scope="module")
@@ -65,48 +71,57 @@ def test_ray_disaggregated_serving(ray_example_root, llm_venv, tp_size):
disagg_dir = os.path.join(ray_example_root, "disaggregated")
script_path = os.path.join(disagg_dir, "disagg_serving_local.sh")
model_dir = f"{llm_models_root()}/llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
subprocess.run("ray stop --force", shell=True, check=False)

proc = subprocess.Popen(
[
"bash", script_path, "--executor", "ray", "--model", model_dir,
"--tp_size",
str(tp_size)
],
cwd=disagg_dir,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
try:
assert wait_for_server("localhost", 8000, timeout_seconds=180), \
"Disaggregated server failed to start within 3 minutes"

result = subprocess.run([
"curl", "-sS", "-w", "\n%{http_code}",
"http://localhost:8000/v1/completions", "-H",
"Content-Type: application/json", "-d",
'{"model":"TinyLlama-1.1B-Chat-v1.0","prompt":"NVIDIA is a great company because","max_tokens":16,"temperature":0}'
],
capture_output=True,
text=True,
timeout=30)

*body_lines, status_line = result.stdout.strip().splitlines()
body = "\n".join(body_lines)
status = int(status_line)

print("HTTP status:", status)
print("Response body:", body)

assert result.returncode == 0, f"curl exit {result.returncode}"
assert status == 200, f"Expected 200, got {status}"

try:
runtime_env = {
"env_vars": {
"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES": "1"
}
}
ray.init(address="local",
include_dashboard=False,
ignore_reinit_error=True,
runtime_env=runtime_env)
gcs_addr = ray.get_runtime_context().gcs_address
ray_port = str(gcs_addr.split(":")[1])

env_copy = os.environ.copy()
env_copy.update({
"RAY_ADDRESS": f"localhost:{ray_port}",
"TLLM_RAY_FORCE_LOCAL_CLUSTER": "0"
})
with popen(
[
"bash", script_path, "--executor", "ray", "--attach", "--model",
model_dir, "--tp_size",
str(tp_size)
],
cwd=disagg_dir,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env_copy,
):
assert wait_for_server("localhost", 8000, timeout_seconds=180), \
"Disaggregated server failed to start within 3 minutes"

result = subprocess.run([
"curl", "-sS", "-w", "\n%{http_code}",
"http://localhost:8000/v1/completions", "-H",
"Content-Type: application/json", "-d",
'{"model":"TinyLlama-1.1B-Chat-v1.0","prompt":"NVIDIA is a great company because","max_tokens":16,"temperature":0}'
],
capture_output=True,
text=True,
timeout=30)

*body_lines, status_line = result.stdout.strip().splitlines()
body = "\n".join(body_lines)
status = int(status_line)

print("HTTP status:", status)
print("Response body:", body)

assert result.returncode == 0, f"curl exit {result.returncode}"
assert status == 200, f"Expected 200, got {status}"
finally:
proc.terminate()
try:
proc.wait(timeout=10)
except Exception:
proc.kill()

subprocess.run("ray stop --force", shell=True, check=False)
subprocess.run("pkill -9 -f trtllm-serve", shell=True, check=False)
ray.shutdown()
59 changes: 59 additions & 0 deletions tests/integration/defs/ray_orchestrator/RL/README.md
@@ -0,0 +1,59 @@
# RL Framework Integration Tests

This directory contains integration tests for TensorRT-LLM with the [Ray orchestrator](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/ray_orchestrator), designed to cover usage patterns from reinforcement learning (RL) frameworks such as VeRL and NeMo RL.

## Available Scripts

| Script | Description |
|--------|-------------|
| `run_rl_perf_reproduce.py` | Emulates RL workload performance with multiple AsyncLLM instances distributed across GPUs using Ray placement groups |

## Usage Examples

### RL Performance Reproduction

The `run_rl_perf_reproduce.py` script creates multiple TensorRT-LLM instances in parallel to simulate RL rollout workloads.
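
As a rough sketch of this pattern (not the script's actual implementation; the `RolloutWorker` actor, its methods, and the resource shapes below are illustrative), each instance can be pinned to its own GPU bundle via a Ray placement group:

```python
import ray
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

ray.init()

num_instances, tp_size = 2, 4  # e.g. 2 instances x TP=4 on an 8-GPU node

# One placement group per instance, reserving tp_size GPUs in a single bundle.
pgs = [placement_group([{"GPU": tp_size, "CPU": 1}]) for _ in range(num_instances)]
ray.get([pg.ready() for pg in pgs])


@ray.remote(num_cpus=1, num_gpus=tp_size)
class RolloutWorker:
    """Illustrative stand-in for one LLM instance; a real worker would build an engine here."""

    def __init__(self, model_dir: str):
        self.model_dir = model_dir

    def generate(self, prompt_ids):
        # Placeholder for engine.generate(...); echoes the input token IDs.
        return prompt_ids


workers = [
    RolloutWorker.options(
        scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg)
    ).remote("/path/to/model_dir")
    for pg in pgs
]
```

In the real script each worker would construct an AsyncLLM engine inside its bundle; only the scheduling skeleton is shown here.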

**TP=4 with 2 instances (8 GPUs total):**

```bash
python run_rl_perf_reproduce.py \
--model_dir /path/to/model_dir \
--data_path /path/to/prompts.json \
--num_instances 2 \
--tp_size 4 \
--top_p 1 \
--logprobs 1 \
--max_batch_size 1024 \
--enable_cuda_graph_padding
```

**TP=1 with 8 instances (8 GPUs total):**

```bash
python run_rl_perf_reproduce.py \
--model_dir /path/to/model_dir \
--data_path /path/to/prompts.json \
--num_instances 8 \
--tp_size 1 \
--top_p 1 \
--logprobs 1 \
--max_batch_size 384 \
--enable_cuda_graph_padding
```

## Data Format

The `--data_path` argument should point to a JSON file containing a list of prompts, where each prompt is a list of token IDs:

```json
[
[1, 2345, 6789, ...],
[1, 3456, 7890, ...],
...
]
```
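
For reference, a minimal sketch of reading such a file (the `load_prompts` helper name is illustrative; the script's own loading code may differ):

```python
import json
from typing import List


def load_prompts(data_path: str) -> List[List[int]]:
    """Load tokenized prompts from a JSON file of token-ID lists."""
    with open(data_path) as f:
        prompts = json.load(f)
    # Every prompt is expected to be a list of integer token IDs.
    assert isinstance(prompts, list) and all(
        isinstance(p, list) and all(isinstance(t, int) for t in p)
        for p in prompts), "data_path must contain a list of token-ID lists"
    return prompts
```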

## Notes

- RL performance reproduction scripts support single-node execution only (max 8 GPUs).