
Commit f4f0fe8

[TRTLLM-9737][chore] Add rl perf reproduce script and enhance the robustness of Ray tests (#9939)
Signed-off-by: Shuyi Xiong <[email protected]>
1 parent 534700e commit f4f0fe8

File tree: 16 files changed (+755, -118 lines)


jenkins/L0_Test.groovy

Lines changed: 1 addition & 0 deletions
@@ -3054,6 +3054,7 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
+        "DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
         "DGX_B200-4_GPUs-PyTorch-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4],
         "DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],

tensorrt_llm/executor/ray_gpu_worker.py

Lines changed: 2 additions & 0 deletions
@@ -91,6 +91,8 @@ def setup_distributed_env_and_worker(self, port: int):
             store=self.store,
             world_size=self.world_size,
             rank=self.rank)
+        assert torch.distributed.get_world_size(
+        ) == self.world_size, "Process group world size must match the expected world size"
         logger.info(
             f"[Rank {self.rank}] Finished PG init. Global GPU ID: {self.gpu}, local GPU ID: {self.local_gpu}"
         )
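
The added assertion catches a process group whose actual size disagrees with the worker's expected `world_size`. A minimal standalone illustration of that invariant, using a single-rank `gloo` group rather than the worker's store-based setup (a sketch only, not the worker's real initialization path):

```python
import os

import torch.distributed as dist

# Sketch: build a trivial single-process group and verify that the process
# group PyTorch actually created matches the world size we expected.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")

expected_world_size = 1
dist.init_process_group(backend="gloo", rank=0, world_size=expected_world_size)

# A mismatch here usually means some ranks joined a different store or port,
# which is exactly the failure mode the new assert surfaces early.
assert dist.get_world_size() == expected_world_size, \
    "Process group world size must match the expected world size"

dist.destroy_process_group()
```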

tests/integration/defs/conftest.py

Lines changed: 1 addition & 13 deletions
@@ -37,7 +37,6 @@
 import yaml
 from _pytest.mark import ParameterSet
 
-from tensorrt_llm._utils import mpi_disabled
 from tensorrt_llm.bindings import ipc_nvls_supported
 from tensorrt_llm.llmapi.mpi_session import get_mpi_world_size
 
@@ -2362,6 +2361,7 @@ def pytest_configure(config):
     tqdm.tqdm.monitor_interval = 0
     if config.getoption("--run-ray"):
         os.environ["TLLM_DISABLE_MPI"] = "1"
+        os.environ["TLLM_RAY_FORCE_LOCAL_CLUSTER"] = "1"
 
     # Initialize PeriodicJUnitXML reporter if enabled
     periodic = config.getoption("--periodic-junit", default=False)
@@ -2825,15 +2825,3 @@ def torch_empty_cache() -> None:
     gc.collect()
     torch.cuda.empty_cache()
     gc.collect()
-
-
-@pytest.fixture(autouse=True)
-def ray_cleanup(llm_venv) -> None:
-    yield
-
-    if mpi_disabled():
-        llm_venv.run_cmd([
-            "-m",
-            "ray.scripts.scripts",
-            "stop",
-        ])
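
With the autouse `ray_cleanup` fixture gone, Ray lifecycle handling moves into the tests themselves, and `pytest_configure` only sets environment variables. For orientation, a typical way such a `--run-ray` flag is registered and consumed in a `conftest.py` is sketched below; the registration itself is not shown in this commit, so treat the snippet as illustrative only:

```python
# Illustrative sketch; the actual --run-ray registration lives elsewhere in
# the repository and is not shown in this commit.
import os


def pytest_addoption(parser):
    parser.addoption("--run-ray",
                     action="store_true",
                     default=False,
                     help="Run the test session against the Ray orchestrator")


def pytest_configure(config):
    if config.getoption("--run-ray"):
        # Mirror the environment set up in the diff above: disable MPI and
        # force each test to use its own in-process local Ray cluster.
        os.environ["TLLM_DISABLE_MPI"] = "1"
        os.environ["TLLM_RAY_FORCE_LOCAL_CLUSTER"] = "1"
```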

tests/integration/defs/disaggregated/test_disaggregated.py

Lines changed: 28 additions & 9 deletions
@@ -21,6 +21,12 @@
 from typing import Callable
 
 import pytest
+
+try:
+    import ray
+except ImportError:
+    import tensorrt_llm.ray_stub as ray
+
 import yaml
 from defs.common import (revise_disagg_config_file_with_free_ports,
                          wait_for_server)
@@ -30,7 +36,7 @@
 from test_common.perf_metrics_utils import (get_timing_metrics,
                                             validate_timing_metrics)
 
-from tensorrt_llm._utils import get_free_port, mpi_disabled
+from tensorrt_llm._utils import mpi_disabled
 from tensorrt_llm.logger import logger
 
 
@@ -357,8 +363,6 @@ def run_disaggregated_test(example_dir,
 
     extra_config_files = []
     workers_cmds = []
-    subprocess.run(['ray', 'start', '--head', '--disable-usage-stats'],
-                   check=True)
 
     # Generate ctx and gen server worker commands
     ctx_extra_config_file = get_extra_llm_config(config['context_servers'],
@@ -415,6 +419,21 @@
                                                  use_ray=False)
 
     else:
+        runtime_env = {
+            "env_vars": {
+                "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES": "1"
+            }
+        }
+        ray.init(address="local",
+                 include_dashboard=False,
+                 ignore_reinit_error=True,
+                 runtime_env=runtime_env)
+        gcs_addr = ray.get_runtime_context().gcs_address
+        ray_port = str(gcs_addr.split(":")[1])
+        run_env.update({
+            "RAY_ADDRESS": f"localhost:{ray_port}",
+            "TLLM_RAY_FORCE_LOCAL_CLUSTER": "0"
+        })
     workers_proc = []
     with contextlib.ExitStack() as stack:
         workers_log = stack.enter_context(
@@ -470,16 +489,16 @@
                 logger.error(f.read())
             raise
         finally:
-            if use_ray:
-                subprocess.run(['ray', 'stop', '--force'], check=False)
-                for extra_file in extra_config_files:
-                    if os.path.exists(extra_file):
-                        os.remove(extra_file)
-            elif 'server_proc' in locals() and 'workers_proc' in locals():
+            if 'server_proc' in locals() and 'workers_proc' in locals():
                 server_proc.terminate()
                 workers_proc.terminate()
                 server_proc.wait()
                 workers_proc.wait()
+            if use_ray:
+                ray.shutdown()
+            for extra_file in extra_config_files:
+                if os.path.exists(extra_file):
+                    os.remove(extra_file)
 
 
 @pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'],
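
The new flow replaces the external `ray start --head` / `ray stop --force` subprocesses with an in-process local cluster that worker subprocesses attach to via `RAY_ADDRESS`, and that is torn down with `ray.shutdown()` in the `finally` block. A condensed, standalone sketch of that lifecycle (the `with_local_ray_cluster` helper below is illustrative, not code from the repository):

```python
import os
import subprocess

import ray


def with_local_ray_cluster(worker_cmd: list[str]) -> int:
    """Illustrative sketch: run one worker subprocess against an in-process
    local Ray cluster, mirroring the test changes above."""
    runtime_env = {
        "env_vars": {
            # Keep Ray from overriding CUDA_VISIBLE_DEVICES in worker processes.
            "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES": "1"
        }
    }
    # address="local" forces a fresh local cluster instead of attaching to an
    # existing one; no separate `ray start --head` subprocess is needed.
    ray.init(address="local",
             include_dashboard=False,
             ignore_reinit_error=True,
             runtime_env=runtime_env)
    try:
        gcs_addr = ray.get_runtime_context().gcs_address  # e.g. "10.0.0.5:6379"
        env = os.environ.copy()
        env.update({
            # Subprocesses attach to this cluster instead of starting their own.
            "RAY_ADDRESS": f"localhost:{gcs_addr.split(':')[1]}",
            "TLLM_RAY_FORCE_LOCAL_CLUSTER": "0",
        })
        return subprocess.run(worker_cmd, env=env, check=False).returncode
    finally:
        # Tears down the local cluster; replaces the old `ray stop --force`.
        ray.shutdown()
```

Because the cluster is owned by the test process, teardown no longer depends on an external `ray stop` command succeeding.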

tests/integration/defs/examples/test_ray.py

Lines changed: 58 additions & 43 deletions
@@ -1,9 +1,15 @@
 import os
 import subprocess
 
+try:
+    import ray
+except ImportError:
+    import tensorrt_llm.ray_stub as ray
+
 import pytest
 from defs.common import venv_check_call, wait_for_server
 from defs.conftest import get_device_count, llm_models_root
+from defs.trt_test_alternative import popen
 
 
 @pytest.fixture(scope="module")
@@ -65,48 +71,57 @@ def test_ray_disaggregated_serving(ray_example_root, llm_venv, tp_size):
     disagg_dir = os.path.join(ray_example_root, "disaggregated")
     script_path = os.path.join(disagg_dir, "disagg_serving_local.sh")
     model_dir = f"{llm_models_root()}/llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
-    subprocess.run("ray stop --force", shell=True, check=False)
-
-    proc = subprocess.Popen(
-        [
-            "bash", script_path, "--executor", "ray", "--model", model_dir,
-            "--tp_size",
-            str(tp_size)
-        ],
-        cwd=disagg_dir,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-    )
-    try:
-        assert wait_for_server("localhost", 8000, timeout_seconds=180), \
-            "Disaggregated server failed to start within 3 minutes"
-
-        result = subprocess.run([
-            "curl", "-sS", "-w", "\n%{http_code}",
-            "http://localhost:8000/v1/completions", "-H",
-            "Content-Type: application/json", "-d",
-            '{"model":"TinyLlama-1.1B-Chat-v1.0","prompt":"NVIDIA is a great company because","max_tokens":16,"temperature":0}'
-        ],
-            capture_output=True,
-            text=True,
-            timeout=30)
-
-        *body_lines, status_line = result.stdout.strip().splitlines()
-        body = "\n".join(body_lines)
-        status = int(status_line)
-
-        print("HTTP status:", status)
-        print("Response body:", body)
-
-        assert result.returncode == 0, f"curl exit {result.returncode}"
-        assert status == 200, f"Expected 200, got {status}"
 
+    try:
+        runtime_env = {
+            "env_vars": {
+                "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES": "1"
+            }
+        }
+        ray.init(address="local",
+                 include_dashboard=False,
+                 ignore_reinit_error=True,
+                 runtime_env=runtime_env)
+        gcs_addr = ray.get_runtime_context().gcs_address
+        ray_port = str(gcs_addr.split(":")[1])
+
+        env_copy = os.environ.copy()
+        env_copy.update({
+            "RAY_ADDRESS": f"localhost:{ray_port}",
+            "TLLM_RAY_FORCE_LOCAL_CLUSTER": "0"
+        })
+        with popen(
+            [
+                "bash", script_path, "--executor", "ray", "--attach", "--model",
+                model_dir, "--tp_size",
+                str(tp_size)
+            ],
+            cwd=disagg_dir,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env_copy,
+        ):
+            assert wait_for_server("localhost", 8000, timeout_seconds=180), \
+                "Disaggregated server failed to start within 3 minutes"
+
+            result = subprocess.run([
+                "curl", "-sS", "-w", "\n%{http_code}",
+                "http://localhost:8000/v1/completions", "-H",
+                "Content-Type: application/json", "-d",
+                '{"model":"TinyLlama-1.1B-Chat-v1.0","prompt":"NVIDIA is a great company because","max_tokens":16,"temperature":0}'
+            ],
+                capture_output=True,
+                text=True,
+                timeout=30)
+
+            *body_lines, status_line = result.stdout.strip().splitlines()
+            body = "\n".join(body_lines)
+            status = int(status_line)
+
+            print("HTTP status:", status)
+            print("Response body:", body)
+
+            assert result.returncode == 0, f"curl exit {result.returncode}"
+            assert status == 200, f"Expected 200, got {status}"
     finally:
-        proc.terminate()
-        try:
-            proc.wait(timeout=10)
-        except Exception:
-            proc.kill()
-
-        subprocess.run("ray stop --force", shell=True, check=False)
-        subprocess.run("pkill -9 -f trtllm-serve", shell=True, check=False)
+        ray.shutdown()
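
`wait_for_server` comes from `defs.common` and is not part of this diff; a minimal polling helper along those lines could look like the sketch below (illustrative only, not the repository's implementation):

```python
import socket
import time


def wait_for_server(host: str, port: int, timeout_seconds: int = 180) -> bool:
    """Illustrative stand-in for defs.common.wait_for_server: poll until a
    TCP connection to (host, port) succeeds or the timeout expires."""
    deadline = time.monotonic() + timeout_seconds
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=5):
                return True
        except OSError:
            time.sleep(2)  # server not up yet; retry shortly
    return False
```
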
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
# RL Framework Integration Tests

This directory contains integration tests for TensorRT-LLM with the [Ray orchestrator](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/ray_orchestrator), specifically designed to cover usage patterns from various RL (Reinforcement Learning) frameworks such as VeRL and NeMo RL.

## Available Scripts

| Script | Description |
|--------|-------------|
| `run_rl_perf_reproduce.py` | Emulates RL workload performance with multiple AsyncLLM instances distributed across GPUs using Ray placement groups |

## Usage Examples

### RL Performance Reproduction

The `run_rl_perf_reproduce.py` script creates multiple TensorRT-LLM instances in parallel to simulate RL rollout workloads.

**TP=4 with 2 instances (8 GPUs total):**

```bash
python run_rl_perf_reproduce.py \
    --model_dir /path/to/model_dir \
    --data_path /path/to/prompts.json \
    --num_instances 2 \
    --tp_size 4 \
    --top_p 1 \
    --logprobs 1 \
    --max_batch_size 1024 \
    --enable_cuda_graph_padding
```

**TP=1 with 8 instances (8 GPUs total):**

```bash
python run_rl_perf_reproduce.py \
    --model_dir /path/to/model_dir \
    --data_path /path/to/prompts.json \
    --num_instances 8 \
    --tp_size 1 \
    --top_p 1 \
    --logprobs 1 \
    --max_batch_size 384 \
    --enable_cuda_graph_padding
```
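
The script body is not shown on this page, but the behavior described above ("multiple AsyncLLM instances distributed across GPUs using Ray placement groups") maps onto a standard Ray pattern. The sketch below is illustrative only: `InstanceWorker`, `rollout`, and `launch_instances` are hypothetical names, not the script's real internals.

```python
import ray
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy


@ray.remote(num_gpus=1)
class InstanceWorker:
    """Hypothetical stand-in for one worker of an LLM instance."""

    def rollout(self, prompt_ids):
        # Real code would run generation here; this just echoes the prompt length.
        return len(prompt_ids)


def launch_instances(num_instances: int, tp_size: int):
    # One placement group per instance reserves tp_size GPUs as a unit;
    # strategy="PACK" keeps an instance's bundles on as few nodes as possible.
    handles = []
    for _ in range(num_instances):
        pg = placement_group([{"GPU": 1, "CPU": 1}] * tp_size, strategy="PACK")
        ray.get(pg.ready())  # block until the GPUs are actually reserved
        workers = [
            InstanceWorker.options(
                scheduling_strategy=PlacementGroupSchedulingStrategy(
                    placement_group=pg,
                    placement_group_bundle_index=i)).remote()
            for i in range(tp_size)
        ]
        handles.append(workers)
    return handles


if __name__ == "__main__":
    ray.init()
    instances = launch_instances(num_instances=2, tp_size=4)
    print(ray.get(instances[0][0].rollout.remote([1, 2345, 6789])))
    ray.shutdown()
```
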
## Data Format

The `--data_path` should point to a JSON file containing a list of prompts, where each prompt is a list of token IDs:

```json
[
  [1, 2345, 6789, ...],
  [1, 3456, 7890, ...],
  ...
]
```
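
One way to produce such a file is to pre-tokenize plain-text prompts. The helper below is a hypothetical example; the tokenizer source, prompts, and file paths are assumptions, not part of the repository:

```python
import json

from transformers import AutoTokenizer

# Assumption: the model directory contains a Hugging Face tokenizer.
tokenizer = AutoTokenizer.from_pretrained("/path/to/model_dir")

prompts = [
    "NVIDIA is a great company because",
    "Explain reinforcement learning in one sentence.",
]

# Each entry becomes a list of token IDs, matching the expected data format.
token_id_lists = [tokenizer.encode(p) for p in prompts]

with open("prompts.json", "w") as f:
    json.dump(token_id_lists, f)
```
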
## Notes

- RL Perf reproduction scripts support single-node execution only (max 8 GPUs)
