diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp index 7e4c26bfd78..78305873638 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp @@ -148,20 +148,19 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa if (mCacheState->getParallelConfig().mEnableAttentionDP) { - int TPSizeInDPGroup - = mCacheState->getParallelConfig().mTensorParallelism / mCacheState->getParallelConfig().mDPsize; int DPSize = mCacheState->getParallelConfig().mDPsize; - int TPRankInDPGroup = worldConfig.getTensorParallelRank() % TPSizeInDPGroup; - int DPRank = (worldConfig.getRank() - TPSizeInDPGroup * DPSize * worldConfig.getPipelineParallelRank() - - TPRankInDPGroup) - / TPSizeInDPGroup; - // + // DPRank is derived from the tensor parallel rank, which already accounts for CP. + // Layout: rank = ppRank * (TP * CP) + tpRank * CP + cpRank. + // getTensorParallelRank() correctly extracts tpRank regardless of CP. + int DPRank = mCacheState->getParallelConfig().mDPrank; + // mGroupDataComm = std::make_shared(mGroupComm->split(DPRank, worldConfig.getRank())); if (worldConfig.isTensorParallel()) { + // Group ranks with same (ppRank, DPRank) accounting for CP. mGroupTPInDPComm = std::make_shared( - mGroupComm->split(worldConfig.getRank() / TPSizeInDPGroup, worldConfig.getRank())); + mGroupComm->split(worldConfig.getPipelineParallelRank() * DPSize + DPRank, worldConfig.getRank())); } } bool isMLA = attentionType == executor::kv_cache::CacheState::AttentionType::kMLA; diff --git a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp index 5e7528b8dd4..76f8752e375 100644 --- a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp +++ b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp @@ -552,9 +552,10 @@ class AsymmetricalCacheTest : public ::testing::TestWithParamgetModelConfig().mTokensPerBlock; + + std::optional cpMetaData; + int seqLen = length; + if (mCpSize > 1) + { + cpMetaData.emplace(length, tokensPerBlock, mCpRank, mCpSize); + seqLen = cpMetaData.value().mSeqLenOnThisCPRank; + } + texec::Request request{VecTokens(seqLen, seqLen), maxNewTokens}; auto state = std::make_unique(); state->setCommState(texec::kv_cache::CommState{*mContextCommState}); @@ -905,7 +915,6 @@ class AsymmetricalCacheTest : public ::testing::TestWithParam(requestId, std::move(request)); - std::optional cpMetaData; return std::make_unique(std::move(llmRequestPtr), cpMetaData); } @@ -1428,6 +1437,27 @@ TEST_P(AsymmetricalCacheTestWithDP, TestCase) { GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL and MOONCAKE backend for CP."; } + // Filter request lengths based on CP requirements. + // Each request must have at least one block per CP rank to be valid for CP tests. + std::vector lenList = {60, 30, 60, 10}; + if (genCp > 1) + { + std::vector updatedLenList; + for (auto len : lenList) + { + if (len > tokensPerBlock * (genCp - 1)) + { + updatedLenList.push_back(len); + } + } + if (updatedLenList.empty()) + { + GTEST_SKIP() << "Skipping test because not even one request has one block per genCP rank. 
tokensPerBlock=" + << tokensPerBlock << ", genCp=" << genCp; + } + lenList = updatedLenList; + } + setUpCommunicator(contextTp, contextPp, contextCp, genTp, genPp, genCp, isMLA, contextDP, generationDP); if (mIsContext || mIsGeneration) @@ -1438,7 +1468,7 @@ TEST_P(AsymmetricalCacheTestWithDP, TestCase) setUpCacheTransceiver(); std::vector> requests; int requestId = 0; - for (auto len : {60, 30, 60, 10}) + for (auto len : lenList) { requests.emplace_back(makeLlmRequestWithDP(len, requestId, requestId % contextTp)); requestId++; @@ -1814,6 +1844,44 @@ INSTANTIATE_TEST_CASE_P(AsymmetricCaseTest1WithCPForMLA, AsymmetricalCacheTest, /*generationDP*/ testing::Values(false), /*isWindow*/ testing::Values(false), testing::Values(false), testing::Values(0), testing::Values(128))); +// Tests cases where there's non-trivial TP and PP on context side while non-trivial CP & DP on gen side. +INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithCPAndDPForMLA0, AsymmetricalCacheTestWithDP, + testing::Combine(/*contextTp*/ testing::Values(1, 2), + /*contextPp*/ testing::Values(1, 2), + /*contextCp*/ testing::Values(1), + /*genTp*/ testing::Values(2), + /*genPp*/ testing::Values(1), + /*genCp*/ testing::Values(2), + /*numLayers*/ testing::Values(4), + /*numHeads*/ testing::Values(1), + /*sizePerHead*/ testing::Values(4), + /*tokensPerBlock*/ testing::Values(8), + /*dataType*/ testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), + /*kvFactor*/ testing::Values(1), + /*isMLA*/ testing::Values(true), + /*contextDP*/ testing::Values(false), + /*generationDP*/ testing::Values(true), + /*isWindow*/ testing::Values(false), testing::Values(false), testing::Values(0), testing::Values(128))); + +// Tests cases where there's non-trivial DP on context side while non-trivial CP & DP on gen side. +INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithCPAndDPForMLA1, AsymmetricalCacheTestWithDP, + testing::Combine(/*contextTp*/ testing::Values(2, 4), + /*contextPp*/ testing::Values(1), + /*contextCp*/ testing::Values(1), + /*genTp*/ testing::Values(2), + /*genPp*/ testing::Values(1), + /*genCp*/ testing::Values(2), + /*numLayers*/ testing::Values(4), + /*numHeads*/ testing::Values(1), + /*sizePerHead*/ testing::Values(4), + /*tokensPerBlock*/ testing::Values(8), + /*dataType*/ testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), + /*kvFactor*/ testing::Values(1), + /*isMLA*/ testing::Values(true), + /*contextDP*/ testing::Values(true), + /*generationDP*/ testing::Values(true), + /*isWindow*/ testing::Values(false), testing::Values(false), testing::Values(0), testing::Values(128))); + INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithDPForMLA1, AsymmetricalCacheTestWithDP, testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1), testing::Values(1, 2), testing::Values(1, 2), testing::Values(1), testing::Values(4), testing::Values(1), testing::Values(4), @@ -2226,8 +2294,8 @@ TEST(targetTest, CacheStateContextDP) auto const verifyContext = [&](int contextRank, int generationRank, std::vector const& expectRanks, int expectPPDomain, int expectTPDomain, bool expectNeedSend) { - int contextDPRank = contextRank % contextTP; - int generationDPRank = generationRank % genTP; + int contextDPRank = (contextRank % (contextTP * contextCP)) / contextCP; + int generationDPRank = (generationRank % (genTP * genCP)) / genCP; auto attentionType = isMLA ? 
texec::kv_cache::CacheState::AttentionType::kMLA : texec::kv_cache::CacheState::AttentionType::kDEFAULT; @@ -2239,12 +2307,12 @@ TEST(targetTest, CacheStateContextDP) tokensPerBlock, genTP, genPP, genCP, genAttentionLayerNumPerPP, dataType, attentionType, kvFactor, genEnableDP, generationDPRank, genTP}; - auto const contextTragetInfo + auto const contextTargetInfo = tensorrt_llm::executor::kv_cache::TargetRanksInfoForDP(genCache, contextCache, contextRank); - EXPECT_EQ(expectRanks, contextTragetInfo.mIRanks); - EXPECT_EQ(expectPPDomain, contextTragetInfo.mDomainPPSize); - EXPECT_EQ(expectTPDomain, contextTragetInfo.mDomainTPSize); + EXPECT_EQ(expectRanks, contextTargetInfo.mIRanks); + EXPECT_EQ(expectPPDomain, contextTargetInfo.mDomainPPSize); + EXPECT_EQ(expectTPDomain, contextTargetInfo.mDomainTPSize); EXPECT_EQ(expectNeedSend, MLACacheFormatter::needSendCache(contextCache, genCache, contextRank)); }; @@ -2330,11 +2398,11 @@ TEST(targetTest, CacheStateContextDP) contextTP = 1; genTP = 2; - auto const verfiyGeneration = [&](int contextRank, int generationRank, std::vector const& expectRanks, + auto const verifyGeneration = [&](int contextRank, int generationRank, std::vector const& expectRanks, int expectPPDomain, int expectTPDomain) { - int contextDPRank = contextRank % contextTP; - int generationDPRank = generationRank % genTP; + int contextDPRank = (contextRank % (contextTP * contextCP)) / contextCP; + int generationDPRank = (generationRank % (genTP * genCP)) / genCP; auto attentionType = isMLA ? texec::kv_cache::CacheState::AttentionType::kMLA : texec::kv_cache::CacheState::AttentionType::kDEFAULT; @@ -2346,17 +2414,17 @@ TEST(targetTest, CacheStateContextDP) tokensPerBlock, genTP, genPP, genCP, genAttentionLayerNumPerPP, dataType, attentionType, kvFactor, genEnableDP, generationDPRank, genTP}; - auto const contextTragetInfo + auto const contextTargetInfo = tensorrt_llm::executor::kv_cache::TargetRanksInfoForDP(contextCache, genCache, generationRank); - EXPECT_EQ(expectRanks, contextTragetInfo.mIRanks); - EXPECT_EQ(expectPPDomain, contextTragetInfo.mDomainPPSize); - EXPECT_EQ(expectTPDomain, contextTragetInfo.mDomainTPSize); + EXPECT_EQ(expectRanks, contextTargetInfo.mIRanks); + EXPECT_EQ(expectPPDomain, contextTargetInfo.mDomainPPSize); + EXPECT_EQ(expectTPDomain, contextTargetInfo.mDomainTPSize); }; - verfiyGeneration( + verifyGeneration( /*contextRank*/ 0, /*generationRank*/ 0, /*expectRanks*/ {0}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1); - verfiyGeneration( + verifyGeneration( /*contextRank*/ 0, /*generationRank*/ 1, /*expectRanks*/ {0}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1); contextTP = 1; @@ -2366,9 +2434,9 @@ TEST(targetTest, CacheStateContextDP) contextAttentionLayerNumPerPP = std::vector(contextPP, numLayers / contextPP); genAttentionLayerNumPerPP = std::vector(genPP, numLayers / genPP); - verfiyGeneration( + verifyGeneration( /*contextRank*/ 0, /*generationRank*/ 0, /*expectRanks*/ {0}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1); - verfiyGeneration( + verifyGeneration( /*contextRank*/ 0, /*generationRank*/ 1, /*expectRanks*/ {0}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1); genEnableDP = false; @@ -2381,8 +2449,8 @@ TEST(targetTest, CacheStateContextDP) contextAttentionLayerNumPerPP = std::vector(contextPP, numLayers / contextPP); genAttentionLayerNumPerPP = std::vector(genPP, numLayers / genPP); - verfiyGeneration( + verifyGeneration( /*contextRank*/ 0, /*generationRank*/ 0, /*expectRanks*/ {0}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1); - verfiyGeneration( + 
verifyGeneration( /*contextRank*/ 1, /*generationRank*/ 0, /*expectRanks*/ {1}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1); } diff --git a/tensorrt_llm/_torch/autotuner.py b/tensorrt_llm/_torch/autotuner.py index 1f18270687c..c2c2708074d 100644 --- a/tensorrt_llm/_torch/autotuner.py +++ b/tensorrt_llm/_torch/autotuner.py @@ -1547,7 +1547,7 @@ def _maybe_sync_cache_data(self, strategy: DistributedTuningStrategy, def _merge_cache_data(self, custom_op: str): cache_data = self.profiling_cache.get_specific_custom_op(custom_op) merged_cache_data = dict() - all_cache_data = self._dist.tp_allgather(obj=cache_data) + all_cache_data = self._dist.tp_cp_allgather(obj=cache_data) for data in all_cache_data: for key, value in data.items(): diff --git a/tensorrt_llm/_torch/distributed/communicator.py b/tensorrt_llm/_torch/distributed/communicator.py index 09bbc234ee2..650385187fb 100644 --- a/tensorrt_llm/_torch/distributed/communicator.py +++ b/tensorrt_llm/_torch/distributed/communicator.py @@ -1,4 +1,3 @@ -import copy import math import pickle # nosec B403 from abc import ABC, abstractmethod @@ -136,6 +135,35 @@ def tp_cp_broadcast(self, obj, root=0, **kwargs): obj = self.cp_broadcast(obj, root=root, **kwargs) return obj + @abstractmethod + def tp_allgather(self, obj): + pass + + @abstractmethod + def cp_allgather(self, obj): + pass + + def tp_cp_allgather(self, obj): + """Allgather across both TP and CP dimensions. + + First gathers within CP group, then across TP groups, returning + a flattened list with tp_size * cp_size entries. + """ + # Gather across CP dimension. + if self.cp_size > 1: + obj = self.cp_allgather(obj) + else: + obj = [obj] # Wrap to match cp_allgather output format. + + # Gather across TP dimension. + if self.tp_size > 1: + obj = self.tp_allgather(obj) + else: + obj = [obj] # Wrap to match tp_allgather output format. + + # Flatten: [[cp0, cp1], [cp0, cp1], ...] -> [tp0_cp0, tp0_cp1, tp1_cp0, ...] + return [entry for tp_group in obj for entry in tp_group] + def safe_broadcast(comm, obj, root=0, chunk_size: int = 4 * 1024 * 1024): """ @@ -363,24 +391,9 @@ class MPIDist(Distributed): def __init__(self, mapping: Mapping): super().__init__(mapping) self.create_cp_comm() - # Repurpose CP ranks to TP for Helix so that the right comms are created. - mapping_with_cp = None - if self.mapping.has_cp_helix(): - logger.info( - f"[MPIDist::__init__] Repurposing CP ranks to TP for Helix.") - mapping_with_cp = copy.deepcopy(self.mapping) - self.mapping = self.mapping.repurpose_helix_cp_to_tp() - self.create_tp_comm() self.create_pp_comm() - # Restore the original mapping. - if mapping_with_cp is not None: - logger.info( - f"[MPIDist::__init__] Restoring original mapping undoing Helix manipulation." 
- ) - self.mapping = mapping_with_cp - def broadcast(self, obj, root=0, chunk_size: int = 4 * 1024 * 1024): comm = mpi_comm() return safe_broadcast(comm, obj, root=root, chunk_size=chunk_size) @@ -758,6 +771,22 @@ def cp_broadcast(self, obj, root=0, **kwargs): device=torch.device("cpu")) return ret[0] + @log_op + def cp_allgather(self, obj): + if isinstance(obj, torch.Tensor): + output_list = [ + torch.empty_like(obj) + for _ in range(self.mapping.cp_group_pg.size()) + ] + dist.all_gather(output_list, obj, group=self.mapping.cp_group_pg) + return output_list + else: + output_list = [None] * self.mapping.cp_group_pg.size() + dist.all_gather_object(output_list, + obj, + group=self.mapping.cp_group_pg) + return output_list + @log_op def pp_allgather(self, obj): if isinstance(obj, torch.Tensor): diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py index f475280f851..2474d416d64 100755 --- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py +++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py @@ -1120,13 +1120,19 @@ def __init__(self, reduce_output=not self.enable_attention_dp and self.mapping.tp_size > 1) else: + # When enable_attention_dp is True, TP reduction is skipped since each DP rank + # works on different batch elements. However, with CP > 1, attention is split + # across CP ranks for the SAME batch element, so reduction is still needed + # within the CP group. + needs_tp_reduce = not self.enable_attention_dp and self.mapping.tp_size > 1 + needs_cp_reduce = mapping_with_cp is not None and mapping_with_cp.has_cp_helix( + ) self.self_attn = DeepseekV3Attention( model_config, layer_idx=layer_idx_for_attention, aux_stream=aux_stream_dict[AuxStreamType.Attention], mapping_with_cp=mapping_with_cp, - reduce_output=not self.enable_attention_dp - and self.mapping.tp_size > 1) + reduce_output=needs_tp_reduce or needs_cp_reduce) self.fusion_config = EagerFusionConfig() self.enable_fusion = os.environ.get( @@ -1192,10 +1198,15 @@ def __init__(self, eps=config.rms_norm_eps, dtype=config.torch_dtype) + # When enable_attention_dp is True, we normally skip attention all-reduce since each + # DP rank works on different batch elements. However, with CP > 1, attention is split + # across CP ranks for the SAME batch element, so all-reduce is still needed. + has_cp = mapping_with_cp is not None and mapping_with_cp.cp_size > 1 + can_skip_for_attention_dp = self.enable_attention_dp and not has_cp self.disable_attn_allreduce = (self.fusion_config.PRE_MOE_FUSION or self.fusion_config.PRE_MLP_FUSION or self.mapping.tp_size == 1 - or self.enable_attention_dp) + or can_skip_for_attention_dp) self.post_attention_layernorm = RMSNorm(hidden_size=config.hidden_size, eps=config.rms_norm_eps, diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py index 69ae313713a..bd1b4274993 100644 --- a/tensorrt_llm/_torch/modules/attention.py +++ b/tensorrt_llm/_torch/modules/attention.py @@ -814,7 +814,9 @@ def __init__( tp_size = self.mapping.tp_size pp_size = self.mapping.pp_size cp_size = self.mapping.cp_size + dp_size = 1 if self.mapping.enable_attention_dp: + dp_size = tp_size tp_size = 1 if self.mapping.has_cp_ulysses(): raise NotImplementedError("MLA doesn't support CP Ulyssees yet") @@ -823,9 +825,9 @@ def __init__( ), f"CP type must be HELIX for MLA, but got {self.mapping.cp_config['cp_type']}." 
mapping = Mapping( - world_size=tp_size * pp_size * cp_size, + world_size=pp_size * dp_size * tp_size * cp_size, tp_size=tp_size, - pp_size=pp_size, + pp_size=pp_size * dp_size, cp_size=cp_size, cp_config=self.mapping.cp_config, rank=self.mapping.rank, @@ -924,9 +926,9 @@ def __init__( ) mapping_o = Mapping( - world_size=tp_size * pp_size * cp_size, + world_size=pp_size * dp_size * tp_size * cp_size, tp_size=tp_size * cp_size, - pp_size=pp_size, + pp_size=pp_size * dp_size, cp_size=1, rank=self.mapping.rank, gpus_per_node=self.mapping.gpus_per_node, diff --git a/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py b/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py index 161282e4c4e..042ed823865 100644 --- a/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py +++ b/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py @@ -359,13 +359,22 @@ def _fetch_new_requests_attention_tp( def _fetch_new_requests_attention_dp( self, activate_requests: List[LlmRequest]) -> List[LlmRequest]: """Handle attention DP request fetching with load balancing.""" - # Get active request counts across all ranks + # Get active request counts across all ranks. all_ranks_num_active_requests = [] all_ranks_num_active_tokens = [] - num_active_tokens = sum( - [req.py_orig_prompt_len for req in activate_requests]) + + if self.dist.has_cp_helix: + num_active_tokens = sum( + [req.total_input_len_cp for req in activate_requests]) + else: + num_active_tokens = sum( + [req.py_orig_prompt_len for req in activate_requests]) + + # Note: We use tp_allgather even for CP assuming that all CP ranks with the + # same dp_rank have the same num_active_tokens and num_active_requests. responses_list = self.dist.tp_allgather( [len(activate_requests), num_active_tokens]) + for num_active_requests, num_active_tokens in responses_list: all_ranks_num_active_requests.append(num_active_requests) all_ranks_num_active_tokens.append(num_active_tokens) diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index f186da6cd89..b1a5f7bc5bb 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -1324,7 +1324,7 @@ def _postprocess_inputs(self, inputs: Dict[str, Any]): def _get_all_rank_num_tokens(self, attn_metadata: AttentionMetadata): if self.enable_attention_dp: - return list(self.dist.tp_allgather(attn_metadata.num_tokens)) + return list(self.dist.tp_cp_allgather(attn_metadata.num_tokens)) return None def _get_all_rank_ctx_requests(self, num_ctx_requests: int): @@ -1536,7 +1536,7 @@ def _prepare_incremental_update_metadata( # Handle distributed spec metadata if enable_attention_dp: sequence_lengths = spec_metadata.seq_lens - all_rank_num_tokens = self.dist.tp_allgather( + all_rank_num_tokens = self.dist.tp_cp_allgather( [spec_metadata.num_tokens, len(sequence_lengths)]) spec_metadata.all_rank_num_tokens = [ @@ -2691,7 +2691,7 @@ def previous_seq_slots_device(): inputs['spec_metadata'] = spec_metadata if self.enable_attention_dp: - all_rank_num_tokens = self.dist.tp_allgather( + all_rank_num_tokens = self.dist.tp_cp_allgather( [spec_metadata.num_tokens, len(sequence_lengths)]) @@ -2856,7 +2856,7 @@ def _prepare_tp_inputs_no_cache( # support attention dp if self.enable_attention_dp: if spec_metadata is not None: - all_rank_num_tokens = self.dist.tp_allgather([ + all_rank_num_tokens = self.dist.tp_cp_allgather([ attn_metadata.num_tokens, spec_metadata.num_tokens, len(sequence_lengths) ]) @@ -2871,7 +2871,7 @@ def 
_prepare_tp_inputs_no_cache( spec_metadata.all_rank_num_tokens = spec_all_rank_num_tokens spec_metadata.all_rank_num_seqs = all_rank_num_seqs else: - all_rank_num_tokens = self.dist.tp_allgather( + all_rank_num_tokens = self.dist.tp_cp_allgather( attn_metadata.num_tokens) attn_metadata.all_rank_num_tokens = all_rank_num_tokens diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index e8ff00754c1..cfa993215dd 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -1944,6 +1944,8 @@ def _balance_adp_requests(self, context_requests: list[LlmRequest], num_scheduled_tokens = sum( [len(req.get_tokens(0)) for req in context_requests]) + num_scheduled_generation_requests + # Note: We use tp_allgather instead of tp_cp_allgather because we want to + # balance the requests across DP ranks; not CP ranks within those DP ranks. responses_list = self.dist.tp_allgather([ num_scheduled_context_requests, num_scheduled_generation_requests, num_scheduled_tokens diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 2ba2ee1bfee..384129f7ccc 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -871,10 +871,16 @@ def test_auto_dtype(self, overlap_scheduler, mtp_nextn): task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + @skip_pre_blackwell @pytest.mark.skip_less_device(8) - @pytest.mark.parametrize("gen_pp,gen_tp,gen_cp", [(1, 1, 4), (1, 2, 2), - (2, 1, 2)], - ids=["pp1tp1cp4", "pp1tp2cp2", "pp2tp1cp2"]) + @pytest.mark.parametrize( + "gen_pp,gen_tp,gen_cp,enable_attention_dp", [ + (1, 1, 4, False), + (1, 2, 2, False), + (1, 2, 2, True), + (2, 1, 2, False), + ], + ids=["pp1tp1cp4", "pp1tp2cp2", "pp1dp2cp2", "pp2tp1cp2"]) @pytest.mark.parametrize("cuda_graph_config", [ None, { @@ -892,7 +898,7 @@ def test_auto_dtype(self, overlap_scheduler, mtp_nextn): ]) @pytest.mark.parametrize("comms_medium", ["fifo", "nccl"]) def test_auto_dtype_with_helix(self, comms_medium, cuda_graph_config, - gen_pp, gen_tp, gen_cp): + gen_pp, gen_tp, gen_cp, enable_attention_dp): use_nccl_for_alltoall = comms_medium == "nccl" gen_ep = gen_tp * gen_cp kv_cache_config = { @@ -932,6 +938,7 @@ def test_auto_dtype_with_helix(self, comms_medium, cuda_graph_config, "backend": "UCX", "max_tokens_in_buffer": 8192, }, + "enable_attention_dp": enable_attention_dp, } disaggregated_server_config = { "hostname": "localhost", diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 09dc8b22677..2e0809eee67 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -279,11 +279,13 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp1cp4] accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp1cp4] accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp2tp1cp2] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:without_padding-pp2tp1cp2] 
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp2tp1cp2] accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp2tp1cp2] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:without_padding-pp2tp1cp2] accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp2tp1cp2] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1dp2cp2] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1dp2cp2] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1dp2cp2] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1dp2cp2] accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False] accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index eb7ab1bfc60..2f6c09b35ef 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -74,6 +74,7 @@ l0_dgx_b200: - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp2tp1cp2] TIMEOUT (60) - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp2cp2] TIMEOUT (60) - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp1cp4] TIMEOUT (60) + - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1dp2cp2] TIMEOUT (60) - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] TIMEOUT (60) - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] TIMEOUT (60) - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_bs8_mtp] TIMEOUT (60) @@ -104,6 +105,7 @@ l0_dgx_b200: - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp2tp1cp2] TIMEOUT (60) - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp2cp2] TIMEOUT (60) - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp1cp4] TIMEOUT (60) + - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1dp2cp2] TIMEOUT (60) - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_corner_case TIMEOUT (60) - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_fp8kv] TIMEOUT (60) - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] TIMEOUT (60) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 1352c282ae7..556c19a0638 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -375,20 
+375,6 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_m accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False] SKIP (https://nvbugs/5785206) accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized SKIP (https://nvbugs/5785465) accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8 SKIP (https://nvbugs/5785485) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp2cp2] SKIP (https://nvbugs/5787836) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp2cp2] SKIP (https://nvbugs/5787836) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp2cp2] SKIP (https://nvbugs/5787836) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp2cp2] SKIP (https://nvbugs/5787836) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp1cp4] SKIP (https://nvbugs/5787836) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp1cp4] SKIP (https://nvbugs/5787836) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp1cp4] SKIP (https://nvbugs/5787836) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp1cp4] SKIP (https://nvbugs/5787836) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp2tp1cp2] SKIP (https://nvbugs/5787836) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:without_padding-pp2tp1cp2] SKIP (https://nvbugs/5787836) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp2tp1cp2] SKIP (https://nvbugs/5787836) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp2tp1cp2] SKIP (https://nvbugs/5787836) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:without_padding-pp2tp1cp2] SKIP (https://nvbugs/5787836) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp2tp1cp2] SKIP (https://nvbugs/5787836) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugs/5769815) accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=False] SKIP (https://nvbugs/5787892) accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False] SKIP (https://nvbugs/5787892)
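
A small standalone sketch (not part of the patch, helper names are illustrative only) of the two pieces of index arithmetic this change relies on: the rank layout `rank = ppRank * (TP * CP) + tpRank * CP + cpRank` stated in the new cacheTransceiver.cpp comment, from which the updated `verifyContext`/`verifyGeneration` test helpers derive the DP rank as `(rank % (TP * CP)) / CP`, and the tp-major, cp-minor flattening that the new `tp_cp_allgather` docstring describes in communicator.py.

```python
# Standalone sketch (assumed helpers, not TensorRT-LLM code): checks the rank
# layout rank = ppRank * (TP * CP) + tpRank * CP + cpRank and the
# tp_cp_allgather flattening order documented in communicator.py.

def dp_rank_from_global(rank: int, tp: int, cp: int) -> int:
    # Mirrors the updated test helpers: (rank % (TP * CP)) // CP.
    # With attention DP enabled, the TP index doubles as the DP rank.
    return (rank % (tp * cp)) // cp


def tp_cp_allgather_order(tp: int, cp: int) -> list:
    # Emulates the flattening described for tp_cp_allgather: gather over CP
    # first, then TP, then flatten to [tp0_cp0, tp0_cp1, tp1_cp0, ...]
    # (tp-major, cp-minor), giving tp * cp entries.
    per_tp_group = [[f"tp{t}_cp{c}" for c in range(cp)] for t in range(tp)]
    return [entry for tp_group in per_tp_group for entry in tp_group]


if __name__ == "__main__":
    pp, tp, cp = 2, 2, 2
    for pp_rank in range(pp):
        for tp_rank in range(tp):
            for cp_rank in range(cp):
                rank = pp_rank * (tp * cp) + tp_rank * cp + cp_rank
                # The DP rank recovered from the global rank is the TP index,
                # independent of ppRank and cpRank.
                assert dp_rank_from_global(rank, tp, cp) == tp_rank
    assert tp_cp_allgather_order(2, 2) == ["tp0_cp0", "tp0_cp1", "tp1_cp0", "tp1_cp1"]
    print("rank-layout and flatten-order checks passed")
```

Under this layout, gathering over CP inside each TP group and then over TP reproduces exactly `tp_size * cp_size` entries in global-rank order within a pipeline stage, which is why the autotuner and attention-DP paths in this patch can switch from `tp_allgather` to `tp_cp_allgather` without reordering the results.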