infra: Add test list name check (NVIDIA#3097)

EmmaQiaoCh · chzblych · web-flow · commit 48db263d9a0f · 2025-04-20T23:02:16.000+08:00
* Add steps to check test names

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Correct test-db command

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Switch to use a trt-llm image

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Update go path

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Correct go path

Signed-off-by: qqiao <qqiao@nvidia.com>
Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Move the test list check to test ci

Signed-off-by: qqiao <qqiao@nvidia.com>
Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Correct file path

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Fix path again

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Fix get path

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Fix typo

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Skip test list check for ARM

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Fix expression

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Change back unrelated file

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Correct qa test names

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Remove a stage

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Update jenkins/L0_Test.groovy

Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
Signed-off-by: Emma Qiao <qqiao@nvidia.com>
Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Move some steps to a python script

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Fix script path

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Split commands and debug

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Fix typo

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Fix typo

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Also correct case name in waives list

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Move check script to another folder

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Update qa list after rebase

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Fix rebase

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Remove the perf tests under QA

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

* Some tests already fixed after rebase to TOT

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>

---------

Signed-off-by: EmmaQiaoCh <qqiao@nvidia.com>
Signed-off-by: qqiao <qqiao@nvidia.com>
Signed-off-by: Emma Qiao <qqiao@nvidia.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
@@ -473,6 +473,32 @@ def runLLMDocBuild(pipeline, config)
     )
 }
 
+// Validates the L0 and QA test list names by running scripts/check_test_list.py
+// inside a Kubernetes pod. Every step runs unguarded: a failing step throws,
+// and the exception propagates to the caller unchanged.
+def launchTestListCheck(pipeline)
+{
+    // `def` keeps stageName local; a bare assignment would land in the script binding.
+    def stageName = "Test List Check"
+    trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(LLM_DOCKER_IMAGE, "a10"), "trt-llm", {
+        echoNodeAndGpuInfo(pipeline, stageName)
+        trtllm_utils.llmExecStepWithRetry(pipeline, script: """apt-get update && apt-get install \
+            libffi-dev \
+            -y""")
+        // Print the full nvidia-smi report into the build log.
+        sh "nvidia-smi -q"
+        // download TRT-LLM tarfile
+        def tarName = BUILD_CONFIGS[VANILLA_CONFIG][TARNAME]
+        def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${tarName}"
+        trtllm_utils.llmExecStepWithRetry(pipeline, script: "pwd && wget -nv ${llmTarfile} && ls -alh")
+        sh "tar -zxf ${tarName}"
+        // Resolve the absolute source path; the script is taken from the unpacked tree.
+        def llmPath = sh (script: "realpath .", returnStdout: true).trim()
+        def llmSrc = "${llmPath}/TensorRT-LLM/src"
+        sh "python3 ${llmSrc}/scripts/check_test_list.py --l0 --qa"
+    })
+}
+
 def generateStageFailTestResultXml(stageName, subName, failureLog, resultPath) {
     String resultFiles = sh(script: "cd ${stageName} && ls -l ${resultPath} | wc -l", returnStdout: true).trim()
     echo "${resultFiles}"
@@ -1566,6 +1592,20 @@ pipeline {
                 }
             }
         }
+        stage("Check Test Lists")
+        {
+            when {
+                expression {
+                    env.targetArch == X86_64_TRIPLE  // Only execute the check if running on x86
+                }
+            }
+            steps
+            {
+                script {
+                    launchTestListCheck(this)
+                }
+            }
+        }
         stage("Test") {
             steps {
                 script {
diff --git a/scripts/check_test_list.py b/scripts/check_test_list.py
@@ -0,0 +1,80 @@
+import argparse
+import os
+import subprocess
+
+
+def install_python_dependencies(llm_src):
+    subprocess.run(
+        f"cd {llm_src} && pip3 install --retries 1 -r requirements-dev.txt",
+        shell=True,
+        check=True)
+    subprocess.run(
+        f"pip3 install --force-reinstall --no-deps {llm_src}/../tensorrt_llm-*.whl",
+        shell=True,
+        check=True)
+    subprocess.run(
+        "pip3 install --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/sw-tensorrt-pypi/simple "
+        "--ignore-installed trt-test-db==1.8.5+bc6df7",
+        shell=True,
+        check=True)
+
+
+def verify_l0_test_lists(llm_src):
+    test_db_path = f"{llm_src}/tests/integration/test_lists/test-db"
+    test_list = f"{llm_src}/l0_test.txt"
+
+    # Remove dynamically generated perf tests
+    subprocess.run(f"rm -f {test_db_path}/*perf*", shell=True, check=True)
+    subprocess.run(
+        f"trt-test-db -d {test_db_path} --test-names --output {test_list}",
+        shell=True,
+        check=True)
+
+    subprocess.run(
+        f"cd {llm_src}/tests/integration/defs && "
+        f"pytest --apply-test-list-correction --test-list={test_list} --co -q",
+        shell=True,
+        check=True)
+
+
+def verify_qa_test_lists(llm_src):
+    test_qa_path = f"{llm_src}/tests/integration/test_lists/qa"
+    # Remove dynamically generated perf tests
+    subprocess.run(f"rm -f {test_qa_path}/*perf*", shell=True, check=True)
+    test_def_files = subprocess.check_output(
+        f"ls -d {test_qa_path}/*.txt", shell=True).decode().strip().split('\n')
+    for test_def_file in test_def_files:
+        subprocess.run(
+            f"cd {llm_src}/tests/integration/defs && "
+            f"pytest --apply-test-list-correction --test-list={test_def_file} --co -q",
+            shell=True,
+            check=True)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Check test lists for L0 and QA.")
+    parser.add_argument("--l0",
+                        action="store_true",
+                        help="Enable L0 test list verification.")
+    parser.add_argument("--qa",
+                        action="store_true",
+                        help="Enable QA test list verification.")
+    args = parser.parse_args()
+    llm_src = os.path.realpath("TensorRT-LLM/src")
+
+    install_python_dependencies(llm_src)
+    # Verify L0 test lists
+    if args.l0:
+        verify_l0_test_lists(llm_src)
+    else:
+        print("Skipping L0 test list verification.")
+    # Verify QA test lists
+    if args.qa:
+        verify_qa_test_lists(llm_src)
+    else:
+        print("Skipping QA test list verification.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/integration/defs/test_list_parser.py b/tests/integration/defs/test_list_parser.py
@@ -477,7 +477,10 @@ def suggest_correction(valid_test_buckets, test):
     return ret
 
 
-def apply_test_list_corrections(test_list, corrections, test_prefix=None):
+def apply_test_list_corrections(test_list,
+                                corrections,
+                                items,
+                                test_prefix=None):
     """
     Attempt to correct invalid test names in a test list.
 
@@ -508,7 +511,10 @@ def apply_test_list_corrections(test_list, corrections, test_prefix=None):
     #with open(test_list, "w") as f:
     #    f.write(contents)
 
-    pytest.exit(
+    # Clear the items list to prevent pytest from listing collected tests
+    items.clear()
+
+    raise pytest.UsageError(
         "Exiting early since --apply-test-list-correction was specified.")
 
 
@@ -579,7 +585,8 @@ def parse_and_validate_test_list(
         )
 
         if apply_test_list_correction and corrections:
-            apply_test_list_corrections(test_list, corrections, test_prefix)
+            apply_test_list_corrections(test_list, corrections, items,
+                                        test_prefix)
 
         output_dir = config.getoption("--output-dir")
         if record_invalid_tests and corrections:
diff --git a/tests/integration/test_lists/qa/examples_test_list.txt b/tests/integration/test_lists/qa/examples_test_list.txt
@@ -425,12 +425,12 @@ accuracy/test_llm_api.py::TestQwen2_5_7BInstruct::test_fp8
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
-accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[]
+accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
 accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
 accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype