
Commit db2a42f

kaiyux and zerollzeng authored
[None][chore] Add sample yaml for wide-ep example and minor fixes (#8825)
Signed-off-by: Zero Zeng <[email protected]>
Signed-off-by: Kaiyu Xie <[email protected]>
Co-authored-by: Zero Zeng <[email protected]>
1 parent 89336fb commit db2a42f

File tree

examples/disaggregated/slurm/benchmark/config.yaml
examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
examples/disaggregated/slurm/benchmark/submit.py
examples/wide_ep/slurm_scripts/README.md
examples/wide_ep/slurm_scripts/config.yaml

5 files changed: +142 −24 lines changed


examples/disaggregated/slurm/benchmark/config.yaml

Lines changed: 3 additions & 2 deletions

@@ -5,7 +5,7 @@ slurm:
   account: "<account>"
   job_time: "02:00:00"
   job_name: "<job_name>"
-  numa_bind: true
+  numa_bind: true # Only enable for GB200 NVL72

 # Benchmark Mode
 benchmark:
@@ -42,7 +42,6 @@ profiling:
   nsys_on: false # Set to true to enable profiling

 worker_config:
-  eplb_num_slots: 0 # Number of slots for EPLB
   gen:
     tensor_parallel_size: 8
     moe_expert_parallel_size: 8
@@ -77,6 +76,8 @@ worker_config:
     moe_config:
       backend: CUTLASS
       use_low_precision_moe_combine: true
+      load_balancer:
+        num_slots: 0
     cache_transceiver_config:
       max_tokens_in_buffer: 4608
       backend: DEFAULT
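The diff above moves the EPLB slot count from a top-level `worker_config.eplb_num_slots` key to `worker_config.gen.moe_config.load_balancer.num_slots`. As a minimal illustrative sketch (not part of the commit; it assumes PyYAML is installed and reads an illustrative `config.yaml` from the current directory), this is how a script would look the value up at its new location:

```python
import yaml

# Load the benchmark config (path is illustrative).
with open("config.yaml") as f:
    config = yaml.safe_load(f)

# Old location (removed by this commit): config['worker_config']['eplb_num_slots']
# New location: nested under the gen worker's MoE load-balancer settings.
num_slots = config["worker_config"]["gen"]["moe_config"]["load_balancer"]["num_slots"]
print(f"EPLB slots: {num_slots}")
```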

examples/disaggregated/slurm/benchmark/disaggr_torch.slurm

Lines changed: 7 additions & 7 deletions

@@ -7,8 +7,8 @@ gpus_per_node=${1}
 numa_bind=${2}
 ctx_nodes=${3} # Number of nodes needed for ctx workers
 gen_nodes=${4} # Number of nodes needed for gen workers
-ctx_tp_size=${5} # Tensor parallel size for ctx workers
-gen_tp_size=${6} # Tensor parallel size for gen workers
+ctx_world_size=${5} # World size for ctx workers
+gen_world_size=${6} # World size for gen workers

 # Worker configuration
 num_ctx_servers=${7}
@@ -47,8 +47,8 @@ echo " gpus_per_node: ${gpus_per_node}"
 echo " numa_bind: ${numa_bind}"
 echo " ctx_nodes: ${ctx_nodes}"
 echo " gen_nodes: ${gen_nodes}"
-echo " ctx_tp_size: ${ctx_tp_size}"
-echo " gen_tp_size: ${gen_tp_size}"
+echo " ctx_world_size: ${ctx_world_size}"
+echo " gen_world_size: ${gen_world_size}"
 echo
 echo "Worker Configuration:"
 echo " num_ctx_servers: ${num_ctx_servers}"
@@ -123,7 +123,7 @@ if [ -d "${trtllm_repo}" ]; then

 echo "Installing TensorRT-LLM..."
 if ! srun --container-name=${container_name} \
-    --container-mounts=${container_mount} \
+    --container-mounts=${container_mount} --no-container-mount-home \
     --mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
     bash -c "cd ${trtllm_repo} && pip install -e ." \
     &> ${full_logdir}/install.log; then
@@ -167,7 +167,7 @@ echo "ctx_nodes_num_in_single_server: ${ctx_nodes_num_in_single_server}"
 echo "Starting gen workers..."
 for i in $(seq 0 $((num_gen_servers - 1))); do
     srun -l -N ${gen_nodes_num_in_single_server} \
-        --ntasks=${gen_tp_size} \
+        --ntasks=$((gen_world_size)) \
         --ntasks-per-node=${gpus_per_node} \
         --container-image=${container_image} \
         --container-name=${container_name} \
@@ -182,7 +182,7 @@ done
 echo "Starting ctx workers..."
 for i in $(seq 0 $((num_ctx_servers - 1))); do
     srun -l -N ${ctx_nodes_num_in_single_server} \
-        --ntasks=${ctx_tp_size} \
+        --ntasks=$((ctx_world_size)) \
         --ntasks-per-node=${gpus_per_node} \
         --container-image=${container_image} \
         --container-name=${container_name} \

examples/disaggregated/slurm/benchmark/submit.py

Lines changed: 16 additions & 13 deletions

@@ -39,9 +39,9 @@ def save_worker_config(config, output_path, worker_type):
         yaml.dump(worker_config, f, default_flow_style=False)


-def calculate_nodes(tp_size, num_servers, gpus_per_node):
-    """Calculate required nodes based on tensor parallel size and server count."""
-    return (tp_size + gpus_per_node - 1) // gpus_per_node * num_servers
+def calculate_nodes(world_size, num_servers, gpus_per_node):
+    """Calculate required nodes based on world size and server count."""
+    return (world_size + gpus_per_node - 1) // gpus_per_node * num_servers


 def submit_job(config):
@@ -50,10 +50,6 @@ def submit_job(config):
     hw_config = config['hardware']
     env_config = config['environment']

-    # Calculate nodes based on tensor parallel sizes
-    ctx_tp_size = config['worker_config']['ctx']['tensor_parallel_size']
-    gen_tp_size = config['worker_config']['gen']['tensor_parallel_size']
-
     # Get number of servers from config
     ctx_num = hw_config['num_ctx_servers']
     gen_num = hw_config['num_gen_servers']
@@ -63,9 +59,16 @@ def submit_job(config):
     mtp_size = gen_config.get('speculative_config',
                               {}).get('num_nextn_predict_layers', 0)

-    ctx_nodes = calculate_nodes(ctx_tp_size, ctx_num,
+    # Calculate nodes based on world sizes
+    ctx_tp_size = config['worker_config']['ctx']['tensor_parallel_size']
+    ctx_pp_size = config['worker_config']['ctx']['pipeline_parallel_size']
+    ctx_world_size = ctx_tp_size * ctx_pp_size
+    ctx_nodes = calculate_nodes(ctx_world_size, ctx_num,
                                 hw_config['gpus_per_node'])
-    gen_nodes = calculate_nodes(gen_tp_size, gen_num,
+    gen_tp_size = config['worker_config']['gen']['tensor_parallel_size']
+    gen_pp_size = config['worker_config']['gen']['pipeline_parallel_size']
+    gen_world_size = gen_tp_size * gen_pp_size
+    gen_nodes = calculate_nodes(gen_world_size, gen_num,
                                 hw_config['gpus_per_node'])
     total_nodes = ctx_nodes + gen_nodes
     total_tasks = total_nodes * hw_config['gpus_per_node']
@@ -82,9 +85,9 @@ def submit_job(config):

     # Determine directory suffix based on attention_dp
     if gen_enable_attention_dp:
-        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['eplb_num_slots']}_mtp{mtp_size}"
+        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['gen']['moe_config']['load_balancer']['num_slots']}_mtp{mtp_size}"
     else:
-        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_tep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['eplb_num_slots']}_mtp{mtp_size}"
+        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_tep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['gen']['moe_config']['load_balancer']['num_slots']}_mtp{mtp_size}"

     # Create full log directory path
     log_dir = os.path.join(log_base, dir_suffix)
@@ -114,8 +117,8 @@ def submit_job(config):
         str(slurm_config['numa_bind']).lower(),
         str(ctx_nodes),  # Number of nodes needed for ctx workers
         str(gen_nodes),  # Number of nodes needed for gen workers
-        str(ctx_tp_size),  # Tensor parallel size for ctx workers
-        str(gen_tp_size),  # Tensor parallel size for gen workers
+        str(ctx_world_size),  # World size for ctx workers
+        str(gen_world_size),  # World size for gen workers

         # Worker configuration
         str(ctx_num),
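Since `submit.py` now sizes the allocation by world size rather than by tensor-parallel size alone, a short standalone sketch of the formula may help; the parallelism numbers below are illustrative, not taken from the commit:

```python
def calculate_nodes(world_size, num_servers, gpus_per_node):
    """Nodes per server = ceil(world_size / gpus_per_node), times the server count."""
    return (world_size + gpus_per_node - 1) // gpus_per_node * num_servers

# Illustrative values: tp=16, pp=2 on 8-GPU nodes.
gen_world_size = 16 * 2  # tensor_parallel_size * pipeline_parallel_size

# Sizing from tp_size alone would give ceil(16 / 8) = 2 nodes per server,
# but the server actually launches 32 ranks, so 4 nodes are required.
print(calculate_nodes(gen_world_size, num_servers=1, gpus_per_node=8))  # -> 4
```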

examples/wide_ep/slurm_scripts/README.md

Lines changed: 3 additions & 2 deletions

@@ -34,6 +34,7 @@ Before running the scripts, ensure you have:
 ### Run Benchmarks

 ```bash
-# Please find the `submit.py` script and an example `config.yaml` in the `examples/disaggregated/slurm/benchmark/` directory.
-python3 submit.py -c your_config.yaml
+# Please find the `submit.py` script in the `examples/disaggregated/slurm/benchmark/` directory.
+# An example `config.yaml` for wide EP: `examples/wide_ep/slurm_scripts/config.yaml`.
+python3 submit.py -c config.yaml
 ```
examples/wide_ep/slurm_scripts/config.yaml (new file)

Lines changed: 113 additions & 0 deletions
# SLURM Configuration
slurm:
  script_file: "disaggr_torch.slurm"
  partition: "<partition>"
  account: "<account>"
  job_time: "02:00:00"
  job_name: "<job_name>"
  numa_bind: true # Only enable for GB200 NVL72

# Hardware Configuration
hardware:
  gpus_per_node: 4 # Modify this with your hardware configuration
  num_ctx_servers: 2 # Number of context servers
  num_gen_servers: 1 # Number of generation servers

# Benchmark Mode
benchmark:
  mode: "e2e" # Options: e2e, gen_only
  use_nv_sa_benchmark: false # Whether to use NVIDIA SA benchmark script
  multi_round: 1 # Number of benchmark rounds
  benchmark_ratio: 0.8 # Benchmark ratio
  streaming: true # Enable streaming mode
  concurrency_list: "1024"

# Sequence Configuration
sequence:
  input_length: 8196 # Input sequence length
  output_length: 1024 # Output sequence length

# Environment Configuration
environment:
  container_mount: "<container_mount>" # Format: path1:path1,path2:path2
  container_image: "<container_image>"
  model_path: "<model_path>"
  trtllm_repo: "<trtllm_repo>"
  build_wheel: false # Don't build the wheel when launching multiple jobs
  dataset_file: "<dataset_file>"
  work_dir: "<full_path_to_work_dir>"

# Profiling Configuration
profiling:
  nsys_on: false # Set to true to enable profiling

# Worker Configuration
worker_config:
  gen:
    enable_layerwise_nvtx_marker: true
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    pipeline_parallel_size: 1
    max_batch_size: 128
    max_num_tokens: 512
    max_seq_len: 9236
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 32
        - 64
        - 128
        - 256
        - 512
        - 768
        - 1024
        - 2048
        - 128
    print_iter_log: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
      dtype: fp8
    moe_config:
      backend: WIDEEP
      use_low_precision_moe_combine: true
      load_balancer:
        num_slots: 288
        layer_updates_per_iter: 1
    cache_transceiver_config:
      max_tokens_in_buffer: 8448
      backend: DEFAULT
    stream_interval: 20
    num_postprocess_workers: 4
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3
  ctx:
    enable_layerwise_nvtx_marker: true
    max_batch_size: 1
    max_num_tokens: 8448
    max_seq_len: 8212
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    enable_attention_dp: true
    pipeline_parallel_size: 1
    print_iter_log: true
    cuda_graph_config: null
    disable_overlap_scheduler: true
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.75
      dtype: fp8
    cache_transceiver_config:
      max_tokens_in_buffer: 8448
      backend: DEFAULT
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 3
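As a quick sanity check on the sample above, plugging its `gpus_per_node`, server counts, and parallel sizes into the node formula used by `submit.py` gives the allocation this config would request; a small sketch with values copied from the YAML, arithmetic only:

```python
import math

# Values from the wide-EP sample config above.
gpus_per_node = 4
num_ctx_servers, num_gen_servers = 2, 1
ctx_world_size = 4 * 1   # ctx: tensor_parallel_size * pipeline_parallel_size
gen_world_size = 32 * 1  # gen: tensor_parallel_size * pipeline_parallel_size

ctx_nodes = math.ceil(ctx_world_size / gpus_per_node) * num_ctx_servers  # 1 * 2 = 2
gen_nodes = math.ceil(gen_world_size / gpus_per_node) * num_gen_servers  # 8 * 1 = 8
total_nodes = ctx_nodes + gen_nodes                                      # 10
total_tasks = total_nodes * gpus_per_node                                # 40
print(ctx_nodes, gen_nodes, total_nodes, total_tasks)
```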
