Skip to content

Commit 5b8c434

Browse files
committed
add eagle3 gpt-oss test
Signed-off-by: Jhao-Ting Chen <jhaotingc@nvidia.com>
1 parent 112e5bd commit 5b8c434

File tree

9 files changed

+158
-59
lines changed

9 files changed

+158
-59
lines changed

jenkins/L0_Test.groovy

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2894,6 +2894,7 @@ def launchTestJobs(pipeline, testFilter)
28942894

28952895
x86SlurmTestConfigs = [
28962896
"DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
2897+
"DGX_H100-2_GPUs-PyTorch-GptOss-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
28972898
"DGX_H100-2_GPUs-PyTorch-Ray-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
28982899
"DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
28992900
"DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],

tensorrt_llm/_torch/attention_backend/trtllm.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -475,7 +475,7 @@ def run(
475475
self.spec_decoding_generation_lengths,
476476
self.spec_decoding_position_offsets, self.spec_decoding_packed_mask
477477
]
478-
if get_sm_version() >= 100:
478+
if get_sm_version() == 100:
479479
spec_decoding_tensor_params.append(
480480
self.spec_decoding_bl_tree_mask_offset)
481481
spec_decoding_tensor_params.append(self.spec_decoding_bl_tree_mask)
@@ -1219,12 +1219,12 @@ def update_spec_dec_param(
12191219

12201220
# spec_dec mode should only be enabled for non-sm100 machines and when there's a spec-dec tree.
12211221
self.is_spec_decoding_enabled = is_spec_decoding_enabled and (
1222-
get_sm_version() < 100 or get_sm_version() == 120)
1222+
get_sm_version() != 100)
12231223

12241224
self.is_spec_dec_tree = spec_tree_manager is not None
12251225
self.is_spec_dec_dynamic_tree = spec_tree_manager is not None and spec_tree_manager.use_dynamic_tree
12261226

1227-
if get_sm_version() >= 100 and get_sm_version() != 120:
1227+
if get_sm_version() == 100:
12281228
if self.is_spec_dec_tree or self.is_spec_dec_dynamic_tree:
12291229
assert not self.is_spec_dec_tree, "Spec-dec tree is not supported on this machine. Please use a pre-Blackwell machine for a spec-dec tree."
12301230

@@ -1260,7 +1260,7 @@ def update_spec_dec_param(
12601260
device='cuda',
12611261
)
12621262

1263-
if get_sm_version() >= 100:
1263+
if get_sm_version() == 100:
12641264
self.spec_decoding_param_prepare_for_blackwell()
12651265
else:
12661266
self.spec_decoding_bl_tree_mask_offset = None

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 85 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4177,14 +4177,16 @@ def test_w4_chunked_prefill(self, kv_cache_dtype, moe_backend, mocker):
41774177
["CUTLASS",
41784178
pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
41794179
ids=["cutlass", "trtllm", "triton"])
4180-
def test_eagle3(self, moe_backend, one_model, overlap_scheduler, mocker):
4180+
def test_eagle3_4gpus(self, moe_backend, one_model, overlap_scheduler,
4181+
mocker):
41814182
if moe_backend == "TRITON":
41824183
if not IS_TRITON_KERNELS_AVAILABLE:
41834184
pytest.skip("Triton kernels are not available")
41844185

4185-
if get_sm_version() == 90 and moe_backend == "CUTLASS":
4186+
if get_sm_version() == 90:
41864187
pytest.skip(
4187-
"https://nvbugs/5636916: Remaining Hopper Eagle Accuracy Issue")
4188+
"https://nvbugs/5636916: Remaining Hopper Eagle Accuracy Issue for only TP=4"
4189+
)
41884190

41894191
MAX_OUTPUT_LEN = 128179
41904192
MAX_INPUT_LEN = 32768
@@ -4247,6 +4249,86 @@ def test_eagle3(self, moe_backend, one_model, overlap_scheduler, mocker):
42474249
sampling_params=sampling_params,
42484250
extra_evaluator_kwargs=extra_evaluator_kwargs)
42494251

4252+
@pytest.mark.skip_less_device(2)
4253+
@pytest.mark.timeout(14400)
4254+
@pytest.mark.parametrize("overlap_scheduler", [True, False],
4255+
ids=["overlap_scheduler", "no_overlap_scheduler"])
4256+
@pytest.mark.parametrize("one_model", [True, False],
4257+
ids=["one_model", "two_model"])
4258+
@pytest.mark.parametrize(
4259+
"moe_backend",
4260+
["CUTLASS",
4261+
pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
4262+
ids=["cutlass", "trtllm", "triton"])
4263+
def test_eagle3_2gpus(self, moe_backend, one_model, overlap_scheduler,
4264+
mocker):
4265+
if moe_backend == "TRITON":
4266+
if not IS_TRITON_KERNELS_AVAILABLE:
4267+
pytest.skip("Triton kernels are not available")
4268+
4269+
MAX_OUTPUT_LEN = 128179
4270+
MAX_INPUT_LEN = 32768
4271+
4272+
mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
4273+
mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
4274+
{"scores_filter": "exact_match,flexible-extract"})
4275+
4276+
mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN)
4277+
mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)
4278+
4279+
# https://nvbugs/5590408: 2-Model overlap scheduling has an accuracy issue
4280+
pytorch_config = dict(
4281+
max_batch_size=8,
4282+
disable_overlap_scheduler=not overlap_scheduler,
4283+
cuda_graph_config=CudaGraphConfig(max_batch_size=8))
4284+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
4285+
dtype="auto")
4286+
4287+
eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
4288+
draft_len = 3
4289+
spec_config = EagleDecodingConfig(max_draft_len=draft_len,
4290+
speculative_model_dir=eagle_model_dir,
4291+
eagle3_one_model=one_model)
4292+
4293+
max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
4294+
llm = LLM(self.MODEL_PATH,
4295+
tensor_parallel_size=2,
4296+
pipeline_parallel_size=1,
4297+
moe_expert_parallel_size=1,
4298+
kv_cache_config=kv_cache_config,
4299+
max_seq_len=max_seq_len,
4300+
speculative_config=spec_config,
4301+
**pytorch_config,
4302+
enable_attention_dp=False,
4303+
moe_config=MoeConfig(backend=moe_backend))
4304+
4305+
with llm:
4306+
model_name = "GPT-OSS/120B-MXFP4"
4307+
4308+
# GSM8K
4309+
task = GSM8K(model_name)
4310+
task.evaluate(llm,
4311+
extra_evaluator_kwargs=self.extra_evaluator_kwargs)
4312+
4313+
# GPQA Medium Reasoning
4314+
task = GPQADiamond(model_name)
4315+
4316+
chat_template_kwargs = dict(reasoning_effort="medium")
4317+
extra_evaluator_kwargs = {
4318+
**self.extra_evaluator_kwargs, "chat_template_kwargs":
4319+
chat_template_kwargs
4320+
}
4321+
4322+
sampling_params = SamplingParams(
4323+
temperature=1.0,
4324+
top_p=1.0,
4325+
max_tokens=MAX_OUTPUT_LEN,
4326+
truncate_prompt_tokens=MAX_INPUT_LEN)
4327+
4328+
task.evaluate(llm,
4329+
sampling_params=sampling_params,
4330+
extra_evaluator_kwargs=extra_evaluator_kwargs)
4331+
42504332
@pytest.mark.skip_less_device(4)
42514333
@pytest.mark.skip_device_not_contain(["GB200"])
42524334
@pytest.mark.parametrize(

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -566,18 +566,18 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au
566566
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]
567567
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
568568
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8]
569-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler]
570-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler]
571-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler]
572-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler]
573-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-overlap_scheduler]
574-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-no_overlap_scheduler]
575-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-overlap_scheduler]
576-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-no_overlap_scheduler]
577-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-overlap_scheduler]
578-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-no_overlap_scheduler]
579-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-overlap_scheduler]
580-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-no_overlap_scheduler]
569+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler]
570+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler]
571+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler]
572+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-no_overlap_scheduler]
573+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler]
574+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler]
575+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler]
576+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler]
577+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-overlap_scheduler]
578+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler]
579+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler]
580+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler]
581581
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False]
582582
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
583583
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram

tests/integration/test_lists/qa/llm_function_core_sanity.txt

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -103,18 +103,18 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au
103103
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]
104104
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
105105
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8]
106-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler]
107-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler]
108-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler]
109-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler]
110-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-overlap_scheduler]
111-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-no_overlap_scheduler]
112-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-overlap_scheduler]
113-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-no_overlap_scheduler]
114-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-overlap_scheduler]
115-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-no_overlap_scheduler]
116-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-overlap_scheduler]
117-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-no_overlap_scheduler]
106+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler]
107+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler]
108+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler]
109+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-no_overlap_scheduler]
110+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler]
111+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler]
112+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler]
113+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler]
114+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-overlap_scheduler]
115+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler]
116+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler]
117+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler]
118118
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
119119
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
120120
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]

tests/integration/test_lists/qa/llm_function_nim.txt

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -342,18 +342,18 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au
342342
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]
343343
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
344344
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8]
345-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler]
346-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler]
347-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler]
348-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler]
349-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-overlap_scheduler]
350-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-no_overlap_scheduler]
351-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-overlap_scheduler]
352-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-no_overlap_scheduler]
353-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-overlap_scheduler]
354-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model-no_overlap_scheduler]
355-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-overlap_scheduler]
356-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model-no_overlap_scheduler]
345+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler]
346+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler]
347+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler]
348+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-no_overlap_scheduler]
349+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler]
350+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler]
351+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler]
352+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler]
353+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-overlap_scheduler]
354+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler]
355+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler]
356+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler]
357357
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
358358
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
359359
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,9 @@ l0_dgx_b200:
5353
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8]
5454
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
5555
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8]
56-
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-overlap_scheduler]
57-
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-overlap_scheduler]
56+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler]
57+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler]
58+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler]
5859
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
5960
- accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
6061
- accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
@@ -197,12 +198,12 @@ l0_dgx_b200:
197198
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-fp8]
198199
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
199200
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
200-
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-overlap_scheduler]
201-
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-no_overlap_scheduler]
202-
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-no_overlap_scheduler]
203-
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model-no_overlap_scheduler]
204-
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-overlap_scheduler]
205-
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model-no_overlap_scheduler]
201+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler]
202+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler]
203+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler]
204+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler]
205+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler]
206+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-no_overlap_scheduler]
206207
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]
207208
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16]
208209
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]

0 commit comments

Comments
 (0)