Skip to content

Commit c5a1d61

Browse files
committed
[NVIDIA#10707][fix] AutoDeploy: Super accuracy test fixes
- Initial PR NVIDIA#10308 added wrong test name in L0 config files - fix - Add fp8 test - Add (disabled) fp4 test - Slightly decrease bf16 mmlu to accommodate autodeploy test Signed-off-by: Gal Hubara Agam <96368689+galagam@users.noreply.github.com>
1 parent 26bc168 commit c5a1d61

File tree

4 files changed

+52
-4
lines changed

4 files changed

+52
-4
lines changed

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -349,10 +349,10 @@ mistral/Mistral-Large-3-675B:
349349
- spec_dec_algo: Eagle
350350
accuracy: 85.30
351351
nvidia/Nemotron-Super-V3:
352-
- accuracy: 81.07
352+
- accuracy: 80.00
353353
- quant_algo: FP8
354354
kv_cache_quant_algo: FP8
355-
accuracy: 78.22
355+
accuracy: 77.80
356356
- quant_algo: NVFP4
357357
kv_cache_quant_algo: FP8
358358
accuracy: 77.56

tests/integration/defs/accuracy/test_llm_api_autodeploy.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from tensorrt_llm.quantization import QuantAlgo
2121
from tensorrt_llm.sampling_params import SamplingParams
2222

23+
from ..conftest import get_device_count, llm_models_root
2324
from .accuracy_core import GSM8K, MMLU, CnnDailymail, LlmapiAccuracyTestHarness
2425

2526

@@ -244,6 +245,9 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
244245

245246
MODEL_NAME = "nvidia/Nemotron-Super-V3"
246247
MODEL_PATH_BF16 = f"{llm_models_root()}/Nemotron-Super-3-120B-A12B-dev"
248+
MODEL_PATH_FP8 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-fp8-fp8kv"
249+
MODEL_PATH_FP4 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-nvfp4-fp8kv"
250+
247251
# Set minimum possible seq len + small buffer, for test speed & memory usage
248252
MAX_SEQ_LEN = max(MMLU.MAX_INPUT_LEN + MMLU.MAX_OUTPUT_LEN,
249253
GSM8K.MAX_INPUT_LEN + GSM8K.MAX_OUTPUT_LEN)
@@ -289,3 +293,45 @@ def test_bf16(self):
289293
task.evaluate(llm, sampling_params=sampling_params)
290294
task = GSM8K(self.MODEL_NAME)
291295
task.evaluate(llm)
296+
297+
@pytest.mark.skip_less_device_memory(180000)
298+
@pytest.mark.skip_less_device(4)
299+
@pytest.mark.parametrize("world_size", [4, 8])
300+
def test_fp8(self, world_size):
301+
if get_device_count() < world_size:
302+
pytest.skip("Not enough devices for world size, skipping test")
303+
kwargs = self.get_default_kwargs()
304+
sampling_params = self.get_default_sampling_params()
305+
with AutoDeployLLM(model=self.MODEL_PATH_FP8,
306+
tokenizer=self.MODEL_PATH_FP8,
307+
world_size=world_size,
308+
**kwargs) as llm:
309+
# Manually set quant_config for FP8 model to get the accuracy threshold
310+
llm.args.quant_config.quant_algo = QuantAlgo.FP8
311+
llm.args.quant_config.kv_cache_quant_algo = QuantAlgo.FP8
312+
313+
task = MMLU(self.MODEL_NAME)
314+
task.evaluate(llm, sampling_params=sampling_params)
315+
task = GSM8K(self.MODEL_NAME)
316+
task.evaluate(llm)
317+
318+
@pytest.mark.skip("Skipping FP4 test until it is supported")
319+
@pytest.mark.skip_less_device_memory(180000)
320+
@pytest.mark.parametrize("world_size", [1, 4, 8])
321+
def test_fp4(self, world_size):
322+
if get_device_count() < world_size:
323+
pytest.skip("Not enough devices for world size, skipping test")
324+
kwargs = self.get_default_kwargs()
325+
sampling_params = self.get_default_sampling_params()
326+
with AutoDeployLLM(model=self.MODEL_PATH_FP4,
327+
tokenizer=self.MODEL_PATH_FP4,
328+
world_size=world_size,
329+
**kwargs) as llm:
330+
# Manually set quant_config for FP4 model to get the accuracy threshold
331+
llm.args.quant_config.quant_algo = QuantAlgo.NVFP4
332+
llm.args.quant_config.kv_cache_quant_algo = QuantAlgo.NVFP4
333+
334+
task = MMLU(self.MODEL_NAME)
335+
task.evaluate(llm, sampling_params=sampling_params)
336+
task = GSM8K(self.MODEL_NAME)
337+
task.evaluate(llm)

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,4 +218,5 @@ l0_dgx_b200:
218218
tests:
219219
- unittest/_torch/auto_deploy/unit/multigpu
220220
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
221-
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
221+
- accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16[4]
222+
- accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[4]

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -322,4 +322,5 @@ l0_dgx_h100:
322322
tests:
323323
- unittest/_torch/auto_deploy/unit/multigpu
324324
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
325-
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
325+
- accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16[4]
326+
- accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[4]

0 commit comments

Comments
 (0)