Commit c53bc19

[infra] Make test_chunked_prefill faster (NVIDIA#5248)
Signed-off-by: Mike Iovine <[email protected]>
1 parent 5c18160 commit c53bc19

File tree

1 file changed: +5 −4 lines changed

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 5 additions & 4 deletions
@@ -61,16 +61,17 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_less_device_memory(32000)
     @parametrize_with_ids("attn_backend", ["TRTLLM", "FLASHINFER"])
     def test_chunked_prefill(self, attn_backend):
-        pytorch_config = dict(attn_backend=attn_backend, )
+        pytorch_config = dict(
+            attn_backend=attn_backend,
+            # https://nvbugspro.nvidia.com/bug/5345391
+            disable_overlap_scheduler=True)
         llm = LLM(self.MODEL_PATH,
                   enable_chunked_prefill=True,
-                  max_num_tokens=64,
+                  max_num_tokens=512,
                   **pytorch_config)
         with llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
-            task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
 
     @pytest.mark.skip_less_device_memory(32000)
     @parametrize_with_ids(
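For context, below is a reconstruction of the full test as it reads after this commit, with comments on why each change speeds it up. This is a sketch, not the verbatim file: LLM, MMLU, parametrize_with_ids, and the harness attributes (MODEL_PATH, MODEL_NAME) are assumed to come from the surrounding test module.

    # Sketch of test_chunked_prefill after this commit; names from the
    # surrounding test module (LLM, MMLU, parametrize_with_ids,
    # self.MODEL_PATH, self.MODEL_NAME) are assumed.
    @pytest.mark.skip_less_device_memory(32000)
    @parametrize_with_ids("attn_backend", ["TRTLLM", "FLASHINFER"])
    def test_chunked_prefill(self, attn_backend):
        pytorch_config = dict(
            attn_backend=attn_backend,
            # Workaround for https://nvbugspro.nvidia.com/bug/5345391
            disable_overlap_scheduler=True)
        # With chunked prefill enabled, max_num_tokens bounds how many
        # prompt tokens are processed per iteration; raising it from 64
        # to 512 cuts the number of prefill passes per request.
        llm = LLM(self.MODEL_PATH,
                  enable_chunked_prefill=True,
                  max_num_tokens=512,
                  **pytorch_config)
        with llm:
            # The GSM8K evaluation was dropped; MMLU alone still covers
            # the chunked-prefill path at roughly half the runtime.
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)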
