1 change: 1 addition & 0 deletions jenkins/L0_Test.groovy
@@ -3023,6 +3023,7 @@ def launchTestJobs(pipeline, testFilter)
"DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
"DGX_B200-4_GPUs-PyTorch-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4],
"DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
2 changes: 2 additions & 0 deletions tensorrt_llm/executor/ray_gpu_worker.py
@@ -91,6 +91,8 @@ def setup_distributed_env_and_worker(self, port: int):
store=self.store,
world_size=self.world_size,
rank=self.rank)
assert torch.distributed.get_world_size(
) == self.world_size, "Process group world size must match the expected world size"
logger.info(
f"[Rank {self.rank}] Finished PG init. Global GPU ID: {self.gpu}, local GPU ID: {self.local_gpu}"
)
1 change: 1 addition & 0 deletions tests/integration/defs/conftest.py
@@ -2362,6 +2362,7 @@ def pytest_configure(config):
tqdm.tqdm.monitor_interval = 0
if config.getoption("--run-ray"):
os.environ["TLLM_DISABLE_MPI"] = "1"
os.environ["TLLM_RAY_FORCE_LOCAL_CLUSTER"] = "1"

# Initialize PeriodicJUnitXML reporter if enabled
periodic = config.getoption("--periodic-junit", default=False)
13 changes: 10 additions & 3 deletions tests/integration/defs/disaggregated/test_disaggregated.py
@@ -22,15 +22,16 @@

import pytest
import yaml
from defs.common import (revise_disagg_config_file_with_free_ports,
from defs.common import (get_free_port_in_ci,
revise_disagg_config_file_with_free_ports,
wait_for_server)
from defs.conftest import (get_sm_version, llm_models_root, skip_arm,
skip_no_hopper)
from defs.trt_test_alternative import check_call, check_output, popen
from test_common.perf_metrics_utils import (get_timing_metrics,
validate_timing_metrics)

from tensorrt_llm._utils import get_free_port, mpi_disabled
from tensorrt_llm._utils import mpi_disabled
from tensorrt_llm.logger import logger


@@ -357,8 +358,14 @@ def run_disaggregated_test(example_dir,

extra_config_files = []
workers_cmds = []
subprocess.run(['ray', 'start', '--head', '--disable-usage-stats'],
ray_port = get_free_port_in_ci()
subprocess.run([
'ray', 'start', '--head', '--port',
str(ray_port), '--disable-usage-stats'
],
check=True)
run_env["RAY_ADDRESS"] = f"localhost:{ray_port}"
run_env["TLLM_RAY_FORCE_LOCAL_CLUSTER"] = "0"

# Generate ctx and gen server worker commands
ctx_extra_config_file = get_extra_llm_config(config['context_servers'],
16 changes: 12 additions & 4 deletions tests/integration/defs/examples/test_ray.py
@@ -2,7 +2,7 @@
import subprocess

import pytest
from defs.common import venv_check_call, wait_for_server
from defs.common import get_free_port_in_ci, venv_check_call, wait_for_server
from defs.conftest import get_device_count, llm_models_root


@@ -66,16 +66,24 @@ def test_ray_disaggregated_serving(ray_example_root, llm_venv, tp_size):
script_path = os.path.join(disagg_dir, "disagg_serving_local.sh")
model_dir = f"{llm_models_root()}/llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
subprocess.run("ray stop --force", shell=True, check=False)

ray_port = get_free_port_in_ci()
subprocess.run(f"ray start --head --port {ray_port} --disable-usage-stats",
shell=True,
check=False)

env_copy = os.environ.copy()
env_copy.update({"RAY_ADDRESS": f"localhost:{ray_port}"},
{"TLLM_RAY_FORCE_LOCAL_CLUSTER": "0"})
proc = subprocess.Popen(
[
"bash", script_path, "--executor", "ray", "--model", model_dir,
"--tp_size",
"bash", script_path, "--executor", "ray", "--attach", "--model",
model_dir, "--tp_size",
str(tp_size)
],
cwd=disagg_dir,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env_copy,
)
try:
assert wait_for_server("localhost", 8000, timeout_seconds=180), \
59 changes: 59 additions & 0 deletions tests/integration/defs/ray_orchestrator/RL/README.md
@@ -0,0 +1,59 @@
# RL Framework Integration Tests

This directory contains integration tests for TensorRT-LLM with the [Ray orchestrator](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/ray_orchestrator), specifically covering usage patterns from RL (Reinforcement Learning) frameworks such as VeRL and NeMo RL.

## Available Scripts

| Script | Description |
|--------|-------------|
| `run_rl_perf_reproduce.py` | Emulates RL workload performance with multiple AsyncLLM instances distributed across GPUs using Ray placement groups |

## Usage Examples

### RL Performance Reproduction

The `run_rl_perf_reproduce.py` script creates multiple TensorRT-LLM instances in parallel to simulate RL rollout workloads.

**TP=4 with 2 instances (8 GPUs total):**

```bash
python run_rl_perf_reproduce.py \
--model_dir /path/to/model_dir \
--data_path /path/to/prompts.json \
--num_instances 2 \
--tp_size 4 \
--top_p 1 \
--logprobs 1 \
--max_batch_size 1024 \
--enable_cuda_graph_padding
```

**TP=1 with 8 instances (8 GPUs total):**

```bash
python run_rl_perf_reproduce.py \
--model_dir /path/to/model_dir \
--data_path /path/to/prompts.json \
--num_instances 8 \
--tp_size 1 \
--top_p 1 \
--logprobs 1 \
--max_batch_size 384 \
--enable_cuda_graph_padding
```
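
For reference, the following is a minimal sketch (not taken from the script itself) of the placement-group pattern described above: one placement group per instance, each reserving `tp_size` GPUs. The variable names `num_instances` and `tp_size` simply mirror the CLI flags.

```python
import ray
from ray.util.placement_group import placement_group

ray.init()

num_instances = 2  # mirrors --num_instances
tp_size = 4        # mirrors --tp_size

# One placement group per instance, with one single-GPU bundle per
# tensor-parallel rank, so each instance gets a dedicated set of GPUs.
groups = [
    placement_group([{"GPU": 1, "CPU": 1}] * tp_size, strategy="PACK")
    for _ in range(num_instances)
]
ray.get([pg.ready() for pg in groups])  # block until the GPUs are reserved
# Each instance's workers would then be scheduled with
# PlacementGroupSchedulingStrategy(placement_group=groups[i]).
```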

## Data Format

The `--data_path` argument should point to a JSON file containing a list of prompts, where each prompt is a list of token IDs:

```json
[
[1, 2345, 6789, ...],
[1, 3456, 7890, ...],
...
]
```
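
As an illustration, a prompts file in this format could be produced with a Hugging Face tokenizer; the model path, prompt strings, and output file name below are placeholders.

```python
import json

from transformers import AutoTokenizer  # assumes transformers is installed

tokenizer = AutoTokenizer.from_pretrained("/path/to/model_dir")
prompts = [
    "Explain tensor parallelism in one sentence.",
    "What is a Ray placement group?",
]
# Encode each prompt to token IDs and write the list-of-lists JSON.
token_ids = [tokenizer.encode(p) for p in prompts]
with open("prompts.json", "w") as f:
    json.dump(token_ids, f)
```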

## Notes

- RL perf reproduction scripts support single-node execution only (at most 8 GPUs)