From 62398b29ddde2a7f871104e796ca4a0b4f35041a Mon Sep 17 00:00:00 2001 From: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com> Date: Wed, 7 Jan 2026 11:29:16 +0800 Subject: [PATCH 1/4] update deepseekv32 setting yaml Signed-off-by: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com> --- ...gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml | 24 ++++++++++++------- ...en1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml | 18 +++++++++----- ...1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml | 18 +++++++++----- ..._dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml | 18 +++++++++----- ...gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml | 18 +++++++++----- ...gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml | 18 +++++++++----- 6 files changed, 75 insertions(+), 39 deletions(-) diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml index 4dd4d7fb462..a1ef9c042df 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml @@ -43,13 +43,12 @@ environment: profiling: nsys_on: false accuracy: - enable_accuracy_test: false + enable_accuracy_test: false # Set to true to enable accuracy evaluation model: local-completions tasks: gsm8k - model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 worker_config: gen: - enable_layerwise_nvtx_marker: true tensor_parallel_size: 32 moe_expert_parallel_size: 32 enable_attention_dp: true @@ -79,18 +78,22 @@ worker_config: enable_block_reuse: 
false free_gpu_memory_fraction: 0.9 dtype: fp8 + tokens_per_block: 64 moe_config: - backend: WIDEEP - load_balancer: - num_slots: 288 - layer_updates_per_iter: 1 + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core cache_transceiver_config: max_tokens_in_buffer: 4608 backend: NIXL stream_interval: 20 num_postprocess_workers: 4 ctx: - enable_layerwise_nvtx_marker: true max_batch_size: 4 max_num_tokens: 4608 max_seq_len: 2251 @@ -101,10 +104,13 @@ worker_config: print_iter_log: true cuda_graph_config: null disable_overlap_scheduler: true + moe_config: + backend: TRTLLM kv_cache_config: enable_block_reuse: false free_gpu_memory_fraction: 0.85 dtype: fp8 + tokens_per_block: 64 cache_transceiver_config: max_tokens_in_buffer: 4608 - backend: NIXL + backend: NIXL \ No newline at end of file diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml index ca80042c69e..f6c746a5fd1 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml @@ -49,7 +49,6 @@ accuracy: model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 worker_config: gen: - enable_layerwise_nvtx_marker: true tensor_parallel_size: 16 moe_expert_parallel_size: 16 enable_attention_dp: true @@ -79,11 +78,16 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.9 dtype: fp8 + tokens_per_block: 64 moe_config: - backend: WIDEEP - load_balancer: - num_slots: 288 - layer_updates_per_iter: 1 + backend: 
CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core cache_transceiver_config: max_tokens_in_buffer: 4608 backend: NIXL @@ -93,7 +97,6 @@ worker_config: decoding_type: MTP num_nextn_predict_layers: 3 ctx: - enable_layerwise_nvtx_marker: true max_batch_size: 4 max_num_tokens: 4608 max_seq_len: 2251 @@ -104,10 +107,13 @@ worker_config: print_iter_log: true cuda_graph_config: null disable_overlap_scheduler: true + moe_config: + backend: TRTLLM kv_cache_config: enable_block_reuse: false free_gpu_memory_fraction: 0.85 dtype: fp8 + tokens_per_block: 64 cache_transceiver_config: max_tokens_in_buffer: 4608 backend: NIXL diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml index 76f4f78276c..d7076874138 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml @@ -50,7 +50,6 @@ accuracy: model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 worker_config: gen: - enable_layerwise_nvtx_marker: true tensor_parallel_size: 48 moe_expert_parallel_size: 48 enable_attention_dp: true @@ -80,17 +79,21 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.7 dtype: fp8 + tokens_per_block: 64 moe_config: - backend: WIDEEP - load_balancer: - num_slots: 288 - layer_updates_per_iter: 1 + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core cache_transceiver_config: 
max_tokens_in_buffer: 8320 backend: DEFAULT stream_interval: 20 ctx: - enable_layerwise_nvtx_marker: true max_batch_size: 4 max_num_tokens: 4480 max_seq_len: 2176 @@ -101,10 +104,13 @@ worker_config: print_iter_log: true cuda_graph_config: null disable_overlap_scheduler: true + moe_config: + backend: TRTLLM kv_cache_config: enable_block_reuse: false free_gpu_memory_fraction: 0.85 dtype: fp8 + tokens_per_block: 64 cache_transceiver_config: max_tokens_in_buffer: 8320 backend: DEFAULT diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml index 4a91160a99b..2e546620a35 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml @@ -49,7 +49,6 @@ accuracy: model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 worker_config: gen: - enable_layerwise_nvtx_marker: true tensor_parallel_size: 32 moe_expert_parallel_size: 32 enable_attention_dp: true @@ -80,11 +79,16 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.6 dtype: fp8 + tokens_per_block: 64 moe_config: - backend: WIDEEP - load_balancer: - num_slots: 288 - layer_updates_per_iter: 1 + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core cache_transceiver_config: max_tokens_in_buffer: 8448 backend: DEFAULT @@ -94,7 +98,6 @@ worker_config: decoding_type: MTP num_nextn_predict_layers: 3 ctx: - enable_layerwise_nvtx_marker: true max_batch_size: 1 max_num_tokens: 8448 max_seq_len: 9423 @@ -109,6 
+112,9 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.75 dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 8448 backend: DEFAULT diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml index c262e3f6610..ae2a64877b0 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml @@ -49,7 +49,6 @@ accuracy: model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 worker_config: gen: - enable_layerwise_nvtx_marker: true tensor_parallel_size: 16 moe_expert_parallel_size: 16 enable_attention_dp: true @@ -79,18 +78,22 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.7 dtype: fp8 + tokens_per_block: 64 moe_config: - backend: WIDEEP - load_balancer: - num_slots: 288 - layer_updates_per_iter: 1 + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core cache_transceiver_config: max_tokens_in_buffer: 8448 backend: NIXL stream_interval: 20 num_postprocess_workers: 4 ctx: - enable_layerwise_nvtx_marker: true max_batch_size: 1 max_num_tokens: 8448 max_seq_len: 9419 @@ -105,6 +108,9 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.75 dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 8448 backend: NIXL diff --git 
a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml index 0b7bc63e3f0..ce7fb3aedaa 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml @@ -49,7 +49,6 @@ accuracy: model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 worker_config: gen: - enable_layerwise_nvtx_marker: true tensor_parallel_size: 32 moe_expert_parallel_size: 32 enable_attention_dp: true @@ -79,11 +78,16 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.7 dtype: fp8 + tokens_per_block: 64 moe_config: - backend: WIDEEP - load_balancer: - num_slots: 288 - layer_updates_per_iter: 1 + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core cache_transceiver_config: max_tokens_in_buffer: 8448 backend: NIXL @@ -93,7 +97,6 @@ worker_config: decoding_type: MTP num_nextn_predict_layers: 3 ctx: - enable_layerwise_nvtx_marker: true max_batch_size: 1 max_num_tokens: 8448 max_seq_len: 9419 @@ -108,6 +111,9 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.75 dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 8448 backend: NIXL From 663b45c7ef7b9cfcd8e03d01dbcdc03fe5401d5f Mon Sep 17 00:00:00 2001 From: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com> Date: Wed, 7 Jan 2026 11:31:41 +0800 Subject: [PATCH 2/4] update deepseekv32 setting yaml Signed-off-by: yingguo-trt 
<244492186+yingguo-trt@users.noreply.github.com> --- ...v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml index a1ef9c042df..c0d8001da34 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml @@ -46,7 +46,7 @@ accuracy: enable_accuracy_test: false # Set to true to enable accuracy evaluation model: local-completions tasks: gsm8k - model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 worker_config: gen: tensor_parallel_size: 32 From 750bc6967a51ed2b189d0c0fc3b0a7ea1ea36eeb Mon Sep 17 00:00:00 2001 From: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com> Date: Fri, 9 Jan 2026 10:02:04 +0800 Subject: [PATCH 3/4] fix pre-commit error Signed-off-by: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com> --- ...v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml index c0d8001da34..8e8420735e3 100644 --- 
a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml @@ -113,4 +113,4 @@ worker_config: tokens_per_block: 64 cache_transceiver_config: max_tokens_in_buffer: 4608 - backend: NIXL \ No newline at end of file + backend: NIXL From c395709103fff747588c0b9ae42c166d9b57eb4c Mon Sep 17 00:00:00 2001 From: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com> Date: Fri, 9 Jan 2026 12:19:49 +0800 Subject: [PATCH 4/4] remove tokens_per_block parameter Signed-off-by: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com> --- ...2-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml | 4 +--- ...-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml | 2 -- ...p4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml | 2 -- ...4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml | 2 -- ...2-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml | 2 -- ...2-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml | 2 -- 6 files changed, 1 insertion(+), 13 deletions(-) diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml index 8e8420735e3..460a48e8e20 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml @@ -78,7 +78,6 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.9 dtype: fp8 - tokens_per_block: 64 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -90,7 +89,7 
@@ worker_config: - cuda_core cache_transceiver_config: max_tokens_in_buffer: 4608 - backend: NIXL + backend: NIXL stream_interval: 20 num_postprocess_workers: 4 ctx: @@ -110,7 +109,6 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.85 dtype: fp8 - tokens_per_block: 64 cache_transceiver_config: max_tokens_in_buffer: 4608 backend: NIXL diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml index f6c746a5fd1..dff8eec4d97 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml @@ -78,7 +78,6 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.9 dtype: fp8 - tokens_per_block: 64 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -113,7 +112,6 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.85 dtype: fp8 - tokens_per_block: 64 cache_transceiver_config: max_tokens_in_buffer: 4608 backend: NIXL diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml index d7076874138..f3fe861df06 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml @@ -79,7 +79,6 @@ worker_config: enable_block_reuse: false
free_gpu_memory_fraction: 0.7 dtype: fp8 - tokens_per_block: 64 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -110,7 +109,6 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.85 dtype: fp8 - tokens_per_block: 64 cache_transceiver_config: max_tokens_in_buffer: 8320 backend: DEFAULT diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml index 2e546620a35..f24b1a5e9e0 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml @@ -79,7 +79,6 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.6 dtype: fp8 - tokens_per_block: 64 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -112,7 +111,6 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.75 dtype: fp8 - tokens_per_block: 64 moe_config: backend: TRTLLM cache_transceiver_config: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml index ae2a64877b0..2516e69a98b 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml @@ -78,7 +78,6 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.7 dtype: fp8 - tokens_per_block: 
64 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -108,7 +107,6 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.75 dtype: fp8 - tokens_per_block: 64 moe_config: backend: TRTLLM cache_transceiver_config: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml index ce7fb3aedaa..705cc33ea7f 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml @@ -78,7 +78,6 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.7 dtype: fp8 - tokens_per_block: 64 moe_config: backend: CUTEDSL use_low_precision_moe_combine: true @@ -111,7 +110,6 @@ worker_config: enable_block_reuse: false free_gpu_memory_fraction: 0.75 dtype: fp8 - tokens_per_block: 64 moe_config: backend: TRTLLM cache_transceiver_config: