Skip to content

Commit 14bfb5e

Browse files
authored
test: FIX test_ptp_quickstart_advanced_deepseek_v3_2nodes_8gpus (#4283)
* Update test_ptp_quickstart_advanced_deepseek_v3_2nodes_8gpus
  Signed-off-by: xinhe-nv <[email protected]>
* Skip llava-v1.6-mistral-7b-hf-vision-trtllm on L40S
  Signed-off-by: xinhe-nv <[email protected]>
---------
Signed-off-by: xinhe-nv <[email protected]>
1 parent 97bc680 commit 14bfb5e

File tree

3 files changed

+19
-29
lines changed

3 files changed

+19
-29
lines changed

tests/integration/defs/examples/test_multimodal.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -611,7 +611,9 @@ def _test_llm_multimodal_general(llm_venv,
611611
'blip2-flan-t5-xl',
612612
'llava-1.5-7b-hf',
613613
'llava-v1.6-mistral-7b-hf',
614-
'llava-v1.6-mistral-7b-hf-vision-trtllm',
614+
pytest.param('llava-v1.6-mistral-7b-hf-vision-trtllm',
615+
marks=pytest.mark.skipif(get_device_memory() < 50000,
616+
reason="Skip due to low memory")),
615617
'llava-onevision-qwen2-7b-ov-hf',
616618
'llava-onevision-qwen2-7b-ov-hf-video',
617619
'nougat-base',

tests/integration/defs/test_e2e.py

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1349,24 +1349,22 @@ def test_ptp_quickstart_advanced_deepseek_v3_2nodes_8gpus(
13491349
# "RCCA https://nvbugs/5163844"
13501350
print(f"Testing {model_name}.")
13511351
example_root = Path(os.path.join(llm_root, "examples", "pytorch"))
1352-
with tempfile.NamedTemporaryFile(mode='w+t',
1353-
suffix=f".{model_name}.log",
1354-
dir="./",
1355-
delete=True,
1356-
delete_on_close=True) as running_log:
1357-
llm_venv.run_cmd([
1358-
str(example_root / "quickstart_advanced.py"),
1359-
"--model_dir",
1360-
f"{llm_models_root()}/{model_path}",
1361-
"--moe_ep_size=8",
1362-
"--tp_size=16",
1363-
"--use_cuda_graph",
1364-
f"--kv_cache_fraction={_MEM_FRACTION_50}",
1365-
"--max_batch_size=32",
1366-
"--max_num_tokens=2048",
1367-
],
1368-
running_log=running_log)
1369-
# _check_mem_usage(running_log, [56.30, 0, 0, 0])
1352+
run_cmd = [
1353+
"trtllm-llmapi-launch",
1354+
"python3",
1355+
str(example_root / "quickstart_advanced.py"),
1356+
"--enable_overlap_scheduler",
1357+
"--model_dir",
1358+
f"{llm_models_root()}/{model_path}",
1359+
"--moe_ep_size=8",
1360+
"--tp_size=16",
1361+
"--use_cuda_graph",
1362+
f"--kv_cache_fraction={_MEM_FRACTION_50}",
1363+
"--max_batch_size=32",
1364+
"--max_num_tokens=2048",
1365+
"--disable_kv_cache_reuse",
1366+
]
1367+
check_call(" ".join(run_cmd), shell=True, env=llm_venv._new_env)
13701368

13711369

13721370
@pytest.mark.parametrize("model_name,model_path,eagle_model_path", [

tests/integration/test_lists/qa/llm_multinodes_function_test.txt

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,5 @@
11
examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-8b-disable_fp8-tp16pp1-build]
22
examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-8b-disable_fp8-tp16pp1-infer]
3-
examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-70b-disable_fp8-tp8pp2-build]
4-
examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-70b-disable_fp8-tp8pp2-infer]
5-
examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-70b-enable_fp8-tp8pp2-build]
6-
examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-70b-enable_fp8-tp8pp2-infer]
7-
examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-405b-enable_fp8-tp8pp2-build]
8-
examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-405b-enable_fp8-tp8pp2-infer]
9-
examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-405b-disable_fp8-tp8pp2-build]
10-
examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-405b-disable_fp8-tp8pp2-infer]
11-
examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-405b-fp8-disable_fp8-tp8pp2-build]
12-
examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-405b-fp8-disable_fp8-tp8pp2-infer]
133
examples/test_mixtral.py::test_llm_mixtral_2nodes_8gpus[Mixtral-8x22B-v0.1-plugin-renormalize-tensor_parallel-build]
144
examples/test_mixtral.py::test_llm_mixtral_2nodes_8gpus[Mixtral-8x22B-v0.1-plugin-renormalize-tensor_parallel-infer]
155
test_e2e.py::test_ptp_quickstart_advanced_deepseek_v3_2nodes_8gpus[DeepSeek-V3-DeepSeek-V3]

0 commit comments

Comments (0)