Skip to content

Commit bc5780f

Browse files
committed
[NVIDIA#10707][fix] AutoDeploy: Super accuracy test fixes
- Initial PR NVIDIA#10308 added wrong test name in L0 config files — fixed
- Add fp8 test
- Add (disabled) fp4 test
- Slightly decrease bf16 mmlu accuracy threshold to accommodate the autodeploy test

Signed-off-by: Gal Hubara Agam <96368689+galagam@users.noreply.github.com>
1 parent e12a711 commit bc5780f

File tree

4 files changed

+52
-4
lines changed

4 files changed

+52
-4
lines changed

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -349,10 +349,10 @@ mistral/Mistral-Large-3-675B:
349349
- spec_dec_algo: Eagle
350350
accuracy: 85.30
351351
nvidia/Nemotron-Super-V3:
352-
- accuracy: 81.07
352+
- accuracy: 80.00
353353
- quant_algo: FP8
354354
kv_cache_quant_algo: FP8
355-
accuracy: 78.22
355+
accuracy: 77.80
356356
- quant_algo: NVFP4
357357
kv_cache_quant_algo: FP8
358358
accuracy: 77.56

tests/integration/defs/accuracy/test_llm_api_autodeploy.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from tensorrt_llm.quantization import QuantAlgo
2121
from tensorrt_llm.sampling_params import SamplingParams
2222

23+
from ..conftest import get_device_count, llm_models_root
2324
from .accuracy_core import GSM8K, MMLU, CnnDailymail, LlmapiAccuracyTestHarness
2425

2526

@@ -226,6 +227,9 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
226227

227228
MODEL_NAME = "nvidia/Nemotron-Super-V3"
228229
MODEL_PATH_BF16 = f"{llm_models_root()}/Nemotron-Super-3-120B-A12B-dev"
230+
MODEL_PATH_FP8 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-fp8-fp8kv"
231+
MODEL_PATH_FP4 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-nvfp4-fp8kv"
232+
229233
# Set minimum possible seq len + small buffer, for test speed & memory usage
230234
MAX_SEQ_LEN = max(MMLU.MAX_INPUT_LEN + MMLU.MAX_OUTPUT_LEN,
231235
GSM8K.MAX_INPUT_LEN + GSM8K.MAX_OUTPUT_LEN)
@@ -271,3 +275,45 @@ def test_bf16(self):
271275
task.evaluate(llm, sampling_params=sampling_params)
272276
task = GSM8K(self.MODEL_NAME)
273277
task.evaluate(llm)
278+
279+
@pytest.mark.skip_less_device_memory(180000)
280+
@pytest.mark.skip_less_device(4)
281+
@pytest.mark.parametrize("world_size", [4, 8])
282+
def test_fp8(self, world_size):
283+
if get_device_count() < world_size:
284+
pytest.skip("Not enough devices for world size, skipping test")
285+
kwargs = self.get_default_kwargs()
286+
sampling_params = self.get_default_sampling_params()
287+
with AutoDeployLLM(model=self.MODEL_PATH_FP8,
288+
tokenizer=self.MODEL_PATH_FP8,
289+
world_size=world_size,
290+
**kwargs) as llm:
291+
# Manually set quant_config for FP8 model to get the accuracy threshold
292+
llm.args.quant_config.quant_algo = QuantAlgo.FP8
293+
llm.args.quant_config.kv_cache_quant_algo = QuantAlgo.FP8
294+
295+
task = MMLU(self.MODEL_NAME)
296+
task.evaluate(llm, sampling_params=sampling_params)
297+
task = GSM8K(self.MODEL_NAME)
298+
task.evaluate(llm)
299+
300+
@pytest.mark.skip("Skipping FP4 test until it is supported")
301+
@pytest.mark.skip_less_device_memory(180000)
302+
@pytest.mark.parametrize("world_size", [1, 4, 8])
303+
def test_fp4(self, world_size):
304+
if get_device_count() < world_size:
305+
pytest.skip("Not enough devices for world size, skipping test")
306+
kwargs = self.get_default_kwargs()
307+
sampling_params = self.get_default_sampling_params()
308+
with AutoDeployLLM(model=self.MODEL_PATH_FP4,
309+
tokenizer=self.MODEL_PATH_FP4,
310+
world_size=world_size,
311+
**kwargs) as llm:
312+
# Manually set quant_config for FP4 model to get the accuracy threshold
313+
llm.args.quant_config.quant_algo = QuantAlgo.NVFP4
314+
llm.args.quant_config.kv_cache_quant_algo = QuantAlgo.FP8
315+
316+
task = MMLU(self.MODEL_NAME)
317+
task.evaluate(llm, sampling_params=sampling_params)
318+
task = GSM8K(self.MODEL_NAME)
319+
task.evaluate(llm)

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,4 +218,5 @@ l0_dgx_b200:
218218
tests:
219219
- unittest/_torch/auto_deploy/unit/multigpu
220220
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
221-
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
221+
- accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16[4]
222+
- accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[4]

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -321,4 +321,5 @@ l0_dgx_h100:
321321
tests:
322322
- unittest/_torch/auto_deploy/unit/multigpu
323323
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
324-
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
324+
- accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16[4]
325+
- accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[4]

0 commit comments

Comments
 (0)