Skip to content

Commit 268ea9b

Browse files
authored
[None][test] Add one-model and overlap-scheduling to eagle tests for GPTOSS (NVIDIA#9312)
Signed-off-by: Dongfeng Yu <[email protected]>
1 parent 15ceba8 commit 268ea9b

File tree

7 files changed

+66
-20
lines changed

7 files changed

+66
-20
lines changed

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3956,16 +3956,24 @@ def test_w4_chunked_prefill(self, kv_cache_dtype, moe_backend, mocker):
39563956
extra_evaluator_kwargs=extra_evaluator_kwargs)
39573957

39583958
@pytest.mark.skip_less_device(4)
3959+
@pytest.mark.parametrize("overlap_scheduler", [True, False],
3960+
ids=["overlap_scheduler", "no_overlap_scheduler"])
3961+
@pytest.mark.parametrize("one_model", [True, False],
3962+
ids=["one_model", "two_model"])
39593963
@pytest.mark.parametrize(
39603964
"moe_backend",
39613965
["CUTLASS",
39623966
pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
39633967
ids=["cutlass", "trtllm", "triton"])
3964-
def test_eagle3(self, moe_backend, mocker):
3968+
def test_eagle3(self, moe_backend, one_model, overlap_scheduler, mocker):
39653969
if moe_backend == "TRITON":
39663970
if not IS_TRITON_KERNELS_AVAILABLE:
39673971
pytest.skip("Triton kernels are not available")
39683972

3973+
if get_sm_version() == 90 and moe_backend == "CUTLASS":
3974+
pytest.skip(
3975+
"https://nvbugs/5636916: Remaining Hopper Eagle Accuracy Issue")
3976+
39693977
MAX_OUTPUT_LEN = 128179
39703978
MAX_INPUT_LEN = 32768
39713979

@@ -3977,16 +3985,16 @@ def test_eagle3(self, moe_backend, mocker):
39773985
mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)
39783986

39793987
# https://nvbugs/5590408: 2-Model overlap scheduling has an accuracy issue
3980-
pytorch_config = dict(disable_overlap_scheduler=True,
3988+
pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler,
39813989
cuda_graph_config=CudaGraphConfig())
3982-
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
3990+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
39833991
dtype="auto")
39843992

39853993
eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
39863994
draft_len = 3
39873995
spec_config = EagleDecodingConfig(max_draft_len=draft_len,
39883996
speculative_model_dir=eagle_model_dir,
3989-
eagle3_one_model=False)
3997+
eagle3_one_model=one_model)
39903998

39913999
max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
39924000
llm = LLM(self.MODEL_PATH,

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -564,9 +564,18 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au
564564
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]
565565
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
566566
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8]
567-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm]
568-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass]
569-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton]
567+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler]
568+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler]
569+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler]
570+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler]
571+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-overlap_scheduler]
572+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-no_overlap_scheduler]
573+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-overlap_scheduler]
574+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-no_overlap_scheduler]
575+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-overlap_scheduler]
576+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-no_overlap_scheduler]
577+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-overlap_scheduler]
578+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-no_overlap_scheduler]
570579
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False]
571580
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
572581
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram

tests/integration/test_lists/qa/llm_function_core_sanity.txt

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,18 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au
101101
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]
102102
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
103103
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8]
104-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm]
105-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass]
106-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton]
104+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler]
105+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler]
106+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler]
107+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler]
108+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-overlap_scheduler]
109+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-no_overlap_scheduler]
110+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-overlap_scheduler]
111+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-no_overlap_scheduler]
112+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-overlap_scheduler]
113+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-no_overlap_scheduler]
114+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-overlap_scheduler]
115+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-no_overlap_scheduler]
107116
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
108117
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
109118
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]

tests/integration/test_lists/qa/llm_function_nim.txt

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -342,9 +342,18 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au
342342
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]
343343
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
344344
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8]
345-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm]
346-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass]
347-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton]
345+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler]
346+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler]
347+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler]
348+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler]
349+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-overlap_scheduler]
350+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-no_overlap_scheduler]
351+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-overlap_scheduler]
352+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-no_overlap_scheduler]
353+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-overlap_scheduler]
354+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-no_overlap_scheduler]
355+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-overlap_scheduler]
356+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-no_overlap_scheduler]
348357
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
349358
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
350359
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ l0_dgx_b200:
5050
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8]
5151
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
5252
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8]
53-
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm]
54-
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass]
53+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-overlap_scheduler]
54+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler]
5555
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16]
5656
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
5757
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
@@ -196,6 +196,12 @@ l0_dgx_b200:
196196
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-fp8]
197197
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
198198
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
199-
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm]
200-
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass]
199+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-overlap_scheduler]
200+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-no_overlap_scheduler]
201+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-overlap_scheduler]
202+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-no_overlap_scheduler]
203+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler]
204+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler]
205+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler]
206+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler]
201207
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,8 +185,14 @@ l0_dgx_h100:
185185
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
186186
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
187187
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
188-
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass]
189-
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton]
188+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler]
189+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler]
190+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler]
191+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler]
192+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-overlap_scheduler]
193+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-no_overlap_scheduler]
194+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-overlap_scheduler]
195+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-no_overlap_scheduler]
190196
- condition:
191197
ranges:
192198
system_gpu_count:

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,6 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten
346346
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] SKIP (https://nvbugs/5637220)
347347
llmapi/test_llm_examples.py::test_llmapi_example_multilora SKIP (https://nvbugs/5636857)
348348
unittest/_torch/modules/test_mla_helix.py::test_mla_helix_distributed SKIP (https://nvbugspro.nvidia.com/bug/5637012)
349-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass] SKIP (https://nvbugs/5636916)
350349
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5616182)
351350
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-small-128k-instruct-fp8-bfloat16] SKIP (https://nvbugs/5465143)
352351
examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5644684)

0 commit comments

Comments
 (0)