
Commit c566a8d

[None][fix] fix same pp disagg (NVIDIA#6730)
Signed-off-by: Chuang Zhu <[email protected]>
1 parent 767879e commit c566a8d

5 files changed: +38 −8 lines changed


cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp

Lines changed: 11 additions & 4 deletions
@@ -846,16 +846,23 @@ void CacheFormatter::unformat(TransferSession& session)
     }
     int selfNumLayers = selfConfig.getModelConfig().mNbKvHeadsPerLayer.size();
     int selfPPSize = selfConfig.getParallelConfig().mPipelineParallelism;
+    int destPPSize = destConfig.getParallelConfig().mPipelineParallelism;
+    int destNumLayers = destConfig.getModelConfig().mNbKvHeadsPerLayer.size();
+
+    if (selfPPSize == destPPSize)
+    {
+        return true;
+    }
     if (selfNumLayers % selfPPSize != 0)
     {
-        TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers must be divisible by pipeline parallelism");
+        TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism :%d",
+            selfNumLayers, selfPPSize);
         return false;
     }
-    int destNumLayers = destConfig.getModelConfig().mNbKvHeadsPerLayer.size();
-    int destPPSize = destConfig.getParallelConfig().mPipelineParallelism;
     if (destNumLayers % destPPSize != 0)
     {
-        TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers must be divisible by pipeline parallelism");
+        TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism :%d ",
+            destNumLayers, destPPSize);
         return false;
     }
     return true;

cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp

Lines changed: 22 additions & 0 deletions
@@ -583,6 +583,28 @@ void MLACacheFormatter::unformat(TransferSession& session)
         return false;
     }
 
+    int selfNumLayers = selfConfig.getModelConfig().mNbKvHeadsPerLayer.size();
+    int selfPPSize = selfConfig.getParallelConfig().mPipelineParallelism;
+    int destPPSize = destConfig.getParallelConfig().mPipelineParallelism;
+    int destNumLayers = destConfig.getModelConfig().mNbKvHeadsPerLayer.size();
+
+    if (selfPPSize == destPPSize)
+    {
+        return true;
+    }
+    if (selfNumLayers % selfPPSize != 0)
+    {
+        TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism :%d",
+            selfNumLayers, selfPPSize);
+        return false;
+    }
+    if (destNumLayers % destPPSize != 0)
+    {
+        TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism :%d ",
+            destNumLayers, destPPSize);
+        return false;
+    }
+
     return true;
 }
 } // namespace tensorrt_llm::batch_manager::kv_cache_manager

tests/integration/defs/disaggregated/test_disaggregated.py

Lines changed: 3 additions & 2 deletions
@@ -663,13 +663,14 @@ def test_disaggregated_ctxtp2pp2_gentp2pp2(disaggregated_test_root, llm_venv,
 
 
 @pytest.mark.skip_less_device(8)
-@pytest.mark.parametrize("llama_model_root", ['llama-3.1-8b'], indirect=True)
+@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'],
+                         indirect=True)
 def test_disaggregated_ctxpp4_genpp4(disaggregated_test_root, llm_venv,
                                      disaggregated_example_root,
                                      llama_model_root):
     src_dst_dict = {
         llama_model_root:
-        f"{llm_venv.get_working_directory()}/llama-3.1-models/Meta-Llama-3.1-8B",
+        f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0",
     }
     for src, dst in src_dst_dict.items():
         if not os.path.islink(dst):
tests/integration/test_lists/qa/llm_function_full.txt

Lines changed: 1 addition & 1 deletion
@@ -678,7 +678,7 @@ disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-False-Qwen3-8B-FP8]
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-True-Qwen3-8B-FP8]
 disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0]
-disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[llama-3.1-8b]
+disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0]

tests/integration/test_lists/test-db/l0_dgx_h200.yml

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ l0_dgx_h200:
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=4]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4]
   - disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0]
-  - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[llama-3.1-8b]
+  - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0]
   - unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep1-disable_adp-enable_graph-tp8-trtllm-scout]
   - unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep4-enable_adp-enable_graph-tp8-trtllm-scout]
   - unittest/llmapi/test_llm_pytorch.py::test_nemotron_nas_lora
