
Commit 6bdad5e

add gptoss20b and fix nemo nano

Signed-off-by: list <58580514+farazkh80@users.noreply.github.com>

1 parent f171d29

3 files changed: +33 −4 lines


tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 28 additions & 0 deletions
@@ -3958,6 +3958,34 @@ def test_w4_1gpu(self, kv_cache_dtype, moe_backend, cuda_graph,
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
 
+    # on spark 120b accuracy takes 2.2 hours, so we do 20b for now
+    def test_w4_1gpu_20b_spark(self, mocker):
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
+
+        pytorch_config = dict(
+            disable_overlap_scheduler=False,
+            cuda_graph_config=CudaGraphConfig())
+
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+                                        dtype="auto")
+
+        model_path = f"{llm_models_root()}/gpt_oss/gpt-oss-20b"
+        llm = LLM(model_path,
+                  tensor_parallel_size=1,
+                  pipeline_parallel_size=1,
+                  moe_expert_parallel_size=1,
+                  kv_cache_config=kv_cache_config,
+                  **pytorch_config,
+                  moe_config=MoeConfig(backend="CUTLASS"))
+
+        with llm:
+            model_name = "GPT-OSS/20B-MXFP4"
+            task = GSM8K(model_name)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=self.extra_evaluator_kwargs)
+
     def test_dummy_load_format(self):
         llm = LLM(
             self.MODEL_PATH,
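The new test leans on two pytest-mock patches to keep the GSM8K run short on Spark hardware. A minimal sketch of that patching pattern, using a made-up GSM8K stand-in (the class and its default values here are illustrative only; the real one lives in the accuracy-test harness):

# Requires pytest and pytest-mock. GSM8K below is a stand-in with
# invented defaults, not the harness class.
class GSM8K:
    MAX_OUTPUT_LEN = 32768
    EVALUATE_KWARGS = {"scores_filter": "exact_match,strict-match"}


def test_patch_demo(mocker):
    # patch.object swaps a class attribute; patch.dict overlays dict
    # entries. Both are reverted automatically at test teardown.
    mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
    mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                      {"scores_filter": "exact_match,flexible-extract"})
    assert GSM8K.MAX_OUTPUT_LEN == 8192
    assert GSM8K.EVALUATE_KWARGS["scores_filter"] == "exact_match,flexible-extract"

Because the patches are test-scoped, the capped output length and the flexible-extract score filter never leak into other tests in the class.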

tests/integration/defs/test_e2e.py

Lines changed: 1 addition & 1 deletion
@@ -1923,7 +1923,7 @@ def test_ptp_quickstart(llm_root, llm_venv):
 def test_ptp_quickstart_advanced(llm_root, llm_venv, model_name, model_path):
     print(f"Testing {model_name}.")
     example_root = Path(os.path.join(llm_root, "examples", "llm-api"))
-    if model_name == "Nemotron-H-8B":
+    if model_name in ("Nemotron-H-8B", "Nemotron-Nano-v2-nvfp4"):
         llm_venv.run_cmd([
             str(example_root / "quickstart_advanced.py"),
             "--disable_kv_cache_reuse",

tests/integration/test_lists/test-db/l0_gb10.yml

Lines changed: 4 additions & 3 deletions
@@ -15,9 +15,9 @@ l0_gb10:
       backend: pytorch
   tests:
   # ------------- PyTorch tests ---------------
-  - unittest/_torch/modeling -k "modeling_mllama"
-  - unittest/_torch/modeling -k "modeling_out_of_tree"
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto]
+  - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_auto_dtype
+  - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu_20b_spark
 - condition:
     ranges:
       system_gpu_count:
@@ -39,6 +39,7 @@ l0_gb10:
   - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
   - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
   - test_e2e.py::test_ptp_quickstart_advanced[GPT-OSS-20B-gpt_oss/gpt-oss-20b]
+  - test_e2e.py::test_ptp_quickstart_advanced[GPT-OSS-120B-gpt_oss/gpt-oss-120b]
   - test_e2e.py::test_ptp_quickstart_advanced[Qwen3-8b-fp8-Qwen3/nvidia-Qwen3-8B-FP8]
   - test_e2e.py::test_ptp_quickstart_advanced[Qwen3-8b-nvfp4-Qwen3/nvidia-Qwen3-8B-NVFP4]
   - test_e2e.py::test_ptp_quickstart_advanced[Qwen3-14b-fp8-Qwen3/nvidia-Qwen3-14B-FP8]
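The entries in this test list look like pytest node IDs relative to tests/integration/defs, matching the file paths changed above. Assuming that is the intended layout and pytest is installed, a hedged sketch of running one of the new entries locally from that directory:

import pytest

# Run one of the newly added l0_gb10 entries directly (assumption: CWD is
# tests/integration/defs, so the relative path in the yml resolves).
pytest.main([
    "test_e2e.py::test_ptp_quickstart_advanced[GPT-OSS-120B-gpt_oss/gpt-oss-120b]",
    "-v",
])

The same pattern applies to the new accuracy entry, accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu_20b_spark.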
