diff --git a/cpp/kernels/xqa/mha.cu b/cpp/kernels/xqa/mha.cu
index 89eb935cf3a..330364ee88f 100644
--- a/cpp/kernels/xqa/mha.cu
+++ b/cpp/kernels/xqa/mha.cu
@@ -466,20 +466,53 @@ using WarpAcc = WarpAccT;
 #define MMAS_N_PER_MASK 2
 __device__ inline void applyMaskFromInput(Warp const& warp, WarpAcc& acc, MaskType const* mask, uint32_t rowOffset,
-    uint32_t nbValidCols, uint32_t qSeqLen, uint32_t actualQSeqLen, uint32_t headGrpSize)
+    uint32_t nbValidCols, uint32_t qSeqLen, uint32_t actualQSeqLen, uint32_t headGrpSize
+#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE
+    ,
+    int32_t tok0WinBeg, uint32_t seqIter, uint32_t const cacheSeqLen, uint32_t const warpTileTokenBeg
+#endif
+)
 {
     uint32_t const idxInQuad = laneId() % 4;
     uint32_t const idxQuad = laneId() / 4;
     // Packed mask is aligned with 32 bits (2 uint16_t).
     uint32_t const nbPackedMasksPerRow = divUp(qSeqLen, 32u) * 2u;
     uint16_t const* uint16Mask = reinterpret_cast<uint16_t const*>(mask);
+    constexpr uint64_t fullMask = ~uint64_t{0};
+#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE
+    Range const tileRange = {warpTileTokenBeg, warpTileTokenBeg + warpTile.x};
+    Range const maxMaskOutRange = {0, mha::max(0, tok0WinBeg) + (nbValidRows / MMAS_N_PER_MASK - 1)};
+    bool const ctaNeedBegMask = tileRange.beg < maxMaskOutRange.end;
+    assert(ctaNeedBegMask == overlap(tileRange, maxMaskOutRange));
+    int32_t const tok0NbMaskOut = int32_t(tok0WinBeg) - int32_t(warpTileTokenBeg);
+    uint32_t const nbSeqItersWithoutSpecDecMask = (cacheSeqLen - actualQSeqLen) / ctaTile.x;
+    bool const ctaNeedSpecDecMask = (seqIter >= nbSeqItersWithoutSpecDecMask);
+#else
+    constexpr bool ctaNeedBegMask = false;
+    bool const ctaNeedSpecDecMask = true;
+    int32_t const tok0NbMaskOut = -2147483648;
+#endif
+    bool const needMask = ctaNeedBegMask || ctaNeedSpecDecMask;
+
+    if (!needMask)
+    {
+        return;
+    }
 #pragma unroll
     for (uint32_t m = 0; m < acc.rows; m++)
     {
 #pragma unroll
         for (uint32_t i = 0; i < InstAcc::rows; i++)
         {
-            uint32_t const tokenRow = min((rowOffset + instM * m + idxQuad + i * 8) / headGrpSize, actualQSeqLen - 1);
+            uint32_t const idxQTokInCta = (rowOffset + instM * m + idxQuad + i * 8) / headGrpSize;
+            uint32_t const tokenRow = min(idxQTokInCta, actualQSeqLen - 1);
+#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE
+            int32_t const begNbMaskOut = tok0NbMaskOut + int32_t(idxQTokInCta);
+            uint64_t const begMask = (begNbMaskOut > 0 ? fullMask << begNbMaskOut : fullMask);
+#else
+            uint64_t const begMask = fullMask;
+#endif
+
 #pragma unroll
             for (uint32_t mask_n = 0; mask_n < acc.cols / MMAS_N_PER_MASK; mask_n++)
             {
@@ -491,12 +524,15 @@ __device__ inline void applyMaskFromInput(Warp const& warp, WarpAcc& acc, MaskTy
                 uint32_t const maskPos1 = lastCol + actualQSeqLen < nbValidCols ?
                     0u : min(lastCol + actualQSeqLen - nbValidCols, actualQSeqLen - 1);
-                uint32_t packedMask = 0u;
                 uint32_t const maskPosStart = (maskPos0 / 16) * 16;
-                reinterpret_cast<uint16_t*>(&packedMask)[0]
-                    = uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos0 / 16)];
-                reinterpret_cast<uint16_t*>(&packedMask)[1]
-                    = uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos1 / 16)];
+                uint32_t packedMask = ~uint32_t{0};
+                if (ctaNeedSpecDecMask)
+                {
+                    reinterpret_cast<uint16_t*>(&packedMask)[0]
+                        = uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos0 / 16)];
+                    reinterpret_cast<uint16_t*>(&packedMask)[1]
+                        = uint16Mask[tokenRow * nbPackedMasksPerRow + (maskPos1 / 16)];
+                }
 #pragma unroll
                 for (uint32_t nj = 0; nj < MMAS_N_PER_MASK; nj++)
                 {
@@ -510,7 +546,11 @@ __device__ inline void applyMaskFromInput(Warp const& warp, WarpAcc& acc, MaskTy
                         bool const maskFlag = col + actualQSeqLen < nbValidCols ? true
                             : packedMask & (1u << ((col + actualQSeqLen - nbValidCols) - maskPosStart));
-                        acc(m, n)(i, j) = maskFlag && col < nbValidCols ? acc(m, n)(i, j) : safeInitRowMax;
+
+                        bool const begMaskFlag = ctaNeedBegMask ? (begMask & (1ULL << col)) : true;
+
+                        acc(m, n)(i, j)
+                            = maskFlag && begMaskFlag && col < nbValidCols ? acc(m, n)(i, j) : safeInitRowMax;
                     }
                 }
             }
@@ -1611,8 +1651,14 @@ CUBIN_EXPORT __global__
 #endif
     uint32_t const cacheSeqLen = getCacheSeqLen(cacheList, idxReq);
-#if SLIDING_WINDOW
+#if SLIDING_WINDOW && SPEC_DEC && !IS_SPEC_DEC_TREE
+    uint32_t const tok0SeqLen = cacheSeqLen - actualQSeqLen + 1 + idxHeadTokenInGrp; // ctaTokOffset;
+    int32_t const tok0WinBeg = int32_t(tok0SeqLen) - int32_t(slidingWinSize);
+    uint32_t const nbTotalSkipTokens = mha::max(0, tok0WinBeg);
+
+#elif SLIDING_WINDOW
     bool const rtIsReallySliding = (cacheSeqLen > slidingWinSize);
+    assert(!SPEC_DEC || !rtIsReallySliding);
     uint32_t const nbTotalSkipTokens = rtIsReallySliding ? cacheSeqLen - slidingWinSize : 0;
 #else
     constexpr bool rtIsReallySliding = false;
@@ -1626,7 +1672,9 @@ CUBIN_EXPORT __global__
 #endif
     uint32_t const nbSeqIters = useKVCache ? divUp(cacheSeqLen, ctaTile.x) : 0;
-#if SPEC_DEC
+#if SLIDING_WINDOW && SPEC_DEC && !IS_SPEC_DEC_TREE
+    uint32_t const nbSeqItersWithoutMask = nbSkipLeadingTiles;
+#elif SPEC_DEC
     uint32_t const nbSeqItersWithoutMask = (cacheSeqLen - actualQSeqLen) / ctaTile.x;
 #endif
@@ -1912,8 +1960,12 @@ CUBIN_EXPORT __global__
             if (seqIter >= nbSeqItersWithoutMask)
             {
                 uint32_t const nbValidCols = (warpTileTokenBeg < cacheSeqLen ? cacheSeqLen - warpTileTokenBeg : 0U);
-                applyMaskFromInput(
-                    warp, acc, mask, idxHeadTokenInGrp, nbValidCols, qSeqLen, actualQSeqLen, headGrpSize);
+                applyMaskFromInput(warp, acc, mask, idxHeadTokenInGrp, nbValidCols, qSeqLen, actualQSeqLen, headGrpSize
+#if SLIDING_WINDOW && !IS_SPEC_DEC_TREE
+                    ,
+                    tok0WinBeg, seqIter, cacheSeqLen, warpTileTokenBeg
+#endif
+                );
             }
 #else
             bool const isFirstIter = (seqIter == nbSkipLeadingTiles);
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 26c7716ba83..2e3af6fa367 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -2895,6 +2895,7 @@ def launchTestJobs(pipeline, testFilter)
     x86SlurmTestConfigs = [
         "DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
+        "DGX_H100-2_GPUs-PyTorch-GptOss-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
         "DGX_H100-2_GPUs-PyTorch-Ray-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
         "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
diff --git a/tensorrt_llm/_torch/attention_backend/trtllm.py b/tensorrt_llm/_torch/attention_backend/trtllm.py
index d754eb701a8..6e14627d852 100644
--- a/tensorrt_llm/_torch/attention_backend/trtllm.py
+++ b/tensorrt_llm/_torch/attention_backend/trtllm.py
@@ -475,7 +475,7 @@ def run(
             self.spec_decoding_generation_lengths, self.spec_decoding_position_offsets,
             self.spec_decoding_packed_mask
         ]
-        if get_sm_version() >= 100:
+        if self.is_sm_version_trtllm_gen_kernel(sm=get_sm_version()):
             spec_decoding_tensor_params.append(
                 self.spec_decoding_bl_tree_mask_offset)
             spec_decoding_tensor_params.append(self.spec_decoding_bl_tree_mask)
@@ -1219,12 +1219,12 @@ def update_spec_dec_param(
 
         # spec_dec mode should only be enabled for non-sm100 machines and when there's a spec-dec tree.
         self.is_spec_decoding_enabled = is_spec_decoding_enabled and (
-            get_sm_version() < 100 or get_sm_version() == 120)
+            not self.is_sm_version_trtllm_gen_kernel(sm=get_sm_version()))
 
         self.is_spec_dec_tree = spec_tree_manager is not None
         self.is_spec_dec_dynamic_tree = spec_tree_manager is not None and spec_tree_manager.use_dynamic_tree
 
-        if get_sm_version() >= 100 and get_sm_version() != 120:
+        if self.is_sm_version_trtllm_gen_kernel(sm=get_sm_version()):
             if self.is_spec_dec_tree or self.is_spec_dec_dynamic_tree:
                 assert not self.is_spec_dec_tree, "Spec-dec tree is not supported on this machine. Please use a pre-Blackwell machine for a spec-dec tree."
@@ -1260,7 +1260,7 @@ def update_spec_dec_param(
                 device='cuda',
             )
 
-        if get_sm_version() >= 100:
+        if self.is_sm_version_trtllm_gen_kernel(sm=get_sm_version()):
             self.spec_decoding_param_prepare_for_blackwell()
         else:
             self.spec_decoding_bl_tree_mask_offset = None
@@ -1371,6 +1371,9 @@ def generate_spec_decoding_generation_length(self, max_draft_len):
         self.spec_decoding_generation_lengths[:self.max_num_requests].copy_(
             spec_decoding_generation_length, non_blocking=True)
 
+    def is_sm_version_trtllm_gen_kernel(self, sm):
+        return not (sm < 100 or sm in [120, 121])
+
 
 class TrtllmAttention(AttentionBackend[TrtllmAttentionMetadata]):
 
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 24bc65b5e13..35e60e04360 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -4248,14 +4248,16 @@ def test_w4_chunked_prefill(self, kv_cache_dtype, moe_backend, mocker):
         ["CUTLASS",
          pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
         ids=["cutlass", "trtllm", "triton"])
-    def test_eagle3(self, moe_backend, one_model, overlap_scheduler, mocker):
+    def test_eagle3_4gpus(self, moe_backend, one_model, overlap_scheduler,
+                          mocker):
         if moe_backend == "TRITON":
             if not IS_TRITON_KERNELS_AVAILABLE:
                 pytest.skip("Triton kernels are not available")
 
-        if get_sm_version() == 90 and moe_backend == "CUTLASS":
+        if get_sm_version() == 90:
             pytest.skip(
-                "https://nvbugs/5636916: Remaining Hopper Eagle Accuracy Issue")
+                "https://nvbugs/5636916: Remaining Hopper Eagle Accuracy Issue (TP=4 only)"
+            )
 
         MAX_OUTPUT_LEN = 128179
         MAX_INPUT_LEN = 32768
@@ -4318,6 +4320,86 @@ def test_eagle3(self, moe_backend, one_model, overlap_scheduler, mocker):
                       sampling_params=sampling_params,
                       extra_evaluator_kwargs=extra_evaluator_kwargs)
 
+    @pytest.mark.skip_less_device(2)
+    @pytest.mark.timeout(14400)
+    @pytest.mark.parametrize("overlap_scheduler", [True, False],
+                             ids=["overlap_scheduler", "no_overlap_scheduler"])
+    @pytest.mark.parametrize("one_model", [True, False],
+                             ids=["one_model", "two_model"])
+    @pytest.mark.parametrize(
+        "moe_backend",
+        ["CUTLASS",
+         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
+        ids=["cutlass", "trtllm", "triton"])
+    def test_eagle3_2gpus(self, moe_backend, one_model, overlap_scheduler,
+                          mocker):
+        if moe_backend == "TRITON":
+            if not IS_TRITON_KERNELS_AVAILABLE:
+                pytest.skip("Triton kernels are not available")
+
+        MAX_OUTPUT_LEN = 128179
+        MAX_INPUT_LEN = 32768
+
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
+
+        mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN)
+        mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)
+
+        # https://nvbugs/5590408: 2-Model overlap scheduling has accuracy issue
+        pytorch_config = dict(
+            max_batch_size=8,
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=CudaGraphConfig(max_batch_size=8))
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+                                        dtype="auto")
+
+        eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
+        draft_len = 3
+        spec_config = EagleDecodingConfig(max_draft_len=draft_len,
+                                          speculative_model_dir=eagle_model_dir,
+                                          eagle3_one_model=one_model)
+
+        max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
+        llm = LLM(self.MODEL_PATH,
+                  tensor_parallel_size=2,
+                  pipeline_parallel_size=1,
+                  moe_expert_parallel_size=1,
+                  kv_cache_config=kv_cache_config,
+                  max_seq_len=max_seq_len,
+                  speculative_config=spec_config,
+                  **pytorch_config,
+                  enable_attention_dp=False,
+                  moe_config=MoeConfig(backend=moe_backend))
+
+        with llm:
+            model_name = "GPT-OSS/120B-MXFP4"
+
+            # GSM8K
+            task = GSM8K(model_name)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=self.extra_evaluator_kwargs)
+
+            # GPQA Medium Reasoning
+            task = GPQADiamond(model_name)
+
+            chat_template_kwargs = dict(reasoning_effort="medium")
+            extra_evaluator_kwargs = {
+                **self.extra_evaluator_kwargs, "chat_template_kwargs":
+                chat_template_kwargs
+            }
+
+            sampling_params = SamplingParams(
+                temperature=1.0,
+                top_p=1.0,
+                max_tokens=MAX_OUTPUT_LEN,
+                truncate_prompt_tokens=MAX_INPUT_LEN)
+
+            task.evaluate(llm,
+                          sampling_params=sampling_params,
+                          extra_evaluator_kwargs=extra_evaluator_kwargs)
+
     @pytest.mark.skip_less_device(4)
     @pytest.mark.skip_device_not_contain(["GB200"])
     @pytest.mark.parametrize(
diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt
index 92653058cd7..7e58d0f5007 100644
--- a/tests/integration/test_lists/qa/llm_function_core.txt
+++ b/tests/integration/test_lists/qa/llm_function_core.txt
@@ -566,18 +566,18 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-no_overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-no_overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-no_overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
diff --git a/tests/integration/test_lists/qa/llm_function_core_sanity.txt b/tests/integration/test_lists/qa/llm_function_core_sanity.txt
index 6d32579f042..f468d262b13 100644
--- a/tests/integration/test_lists/qa/llm_function_core_sanity.txt
+++ b/tests/integration/test_lists/qa/llm_function_core_sanity.txt
@@ -103,18 +103,18 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-no_overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-no_overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-no_overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler]
 accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]
diff --git a/tests/integration/test_lists/qa/llm_function_nim.txt b/tests/integration/test_lists/qa/llm_function_nim.txt
index 77f0563016c..357cc80a055 100644
--- a/tests/integration/test_lists/qa/llm_function_nim.txt
+++ b/tests/integration/test_lists/qa/llm_function_nim.txt
@@ -342,18 +342,18 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-no_overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-no_overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-no_overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
index 7bac4b180f1..04a4278ba6f 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -57,8 +57,9 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-overlap_scheduler]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
   - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
@@ -207,12 +208,12 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus_online_eplb[enable_configurable_moe-fp8]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-overlap_scheduler]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-no_overlap_scheduler]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-no_overlap_scheduler]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-no_overlap_scheduler]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
index 7e50c6ebf88..c544501a9d0 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -45,6 +45,25 @@ l0_dgx_h100:
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2]
   # llmapi
   - unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 2
+        lte: 2
+    wildcards:
+      gpu:
+      - '*h100*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: pre_merge
+      backend: pytorch
+      auto_trigger: gpt_oss
+      orchestrator: mpi
+  tests:
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-one_model-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-two_model-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[triton-one_model-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[triton-two_model-overlap_scheduler]
 - condition:
     ranges:
       system_gpu_count:
@@ -186,14 +205,6 @@ l0_dgx_h100:
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-overlap_scheduler]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-no_overlap_scheduler]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-overlap_scheduler]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-no_overlap_scheduler]
 - condition:
     ranges:
       system_gpu_count:
diff --git a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
index 58200ca9014..63deed9f869 100644
--- a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
+++ b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
@@ -109,3 +109,7 @@ l0_rtx_pro_6000:
 #  - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] # failed
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-overlap_scheduler]