Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,12 @@ environment:
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
enable_accuracy_test: false # Set to true to enable accuracy evaluation
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true
Expand Down Expand Up @@ -80,17 +79,20 @@ worker_config:
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
backend: CUTEDSL
use_low_precision_moe_combine: true
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 4608
backend: NIXL
backend: NIXL
stream_interval: 20
num_postprocess_workers: 4
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 4
max_num_tokens: 4608
max_seq_len: 2251
Expand All @@ -101,6 +103,8 @@ worker_config:
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
moe_config:
backend: TRTLLM
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.85
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ accuracy:
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
Expand Down Expand Up @@ -80,10 +79,14 @@ worker_config:
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
backend: CUTEDSL
use_low_precision_moe_combine: true
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 4608
backend: NIXL
Expand All @@ -93,7 +96,6 @@ worker_config:
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 4
max_num_tokens: 4608
max_seq_len: 2251
Expand All @@ -104,6 +106,8 @@ worker_config:
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
moe_config:
backend: TRTLLM
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.85
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ accuracy:
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 48
moe_expert_parallel_size: 48
enable_attention_dp: true
Expand Down Expand Up @@ -81,16 +80,19 @@ worker_config:
free_gpu_memory_fraction: 0.7
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
backend: CUTEDSL
use_low_precision_moe_combine: true
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 8320
backend: DEFAULT
stream_interval: 20
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 4
max_num_tokens: 4480
max_seq_len: 2176
Expand All @@ -101,6 +103,8 @@ worker_config:
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
moe_config:
backend: TRTLLM
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.85
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ accuracy:
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true
Expand Down Expand Up @@ -81,10 +80,14 @@ worker_config:
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
backend: CUTEDSL
use_low_precision_moe_combine: true
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 8448
backend: DEFAULT
Expand All @@ -94,7 +97,6 @@ worker_config:
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 1
max_num_tokens: 8448
max_seq_len: 9423
Expand All @@ -109,6 +111,8 @@ worker_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.75
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 8448
backend: DEFAULT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ accuracy:
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
Expand Down Expand Up @@ -80,17 +79,20 @@ worker_config:
free_gpu_memory_fraction: 0.7
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
backend: CUTEDSL
use_low_precision_moe_combine: true
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 8448
backend: NIXL
stream_interval: 20
num_postprocess_workers: 4
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 1
max_num_tokens: 8448
max_seq_len: 9419
Expand All @@ -105,6 +107,8 @@ worker_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.75
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 8448
backend: NIXL
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ accuracy:
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true
Expand Down Expand Up @@ -80,10 +79,14 @@ worker_config:
free_gpu_memory_fraction: 0.7
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
backend: CUTEDSL
use_low_precision_moe_combine: true
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 8448
backend: NIXL
Expand All @@ -93,7 +96,6 @@ worker_config:
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 1
max_num_tokens: 8448
max_seq_len: 9419
Expand All @@ -108,6 +110,8 @@ worker_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.75
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 8448
backend: NIXL
Expand Down