[None] [test] Add B300 cases to CI (#8056)

VALLIS-NERIA · web-flow · commit 9298f1bdcc73 · 2025-10-06T19:23:31.000-07:00
Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp
@@ -36,6 +36,25 @@ using namespace batchedGemm::trtllm::gen;
 
 static BatchedGemmInterface::ModuleCache globalTrtllmGenBatchedGemmModuleCache;
 
+constexpr bool isSMCompatible(int gpuSM, SmVersion kernelSM) // true when kernelSM is accepted for gpuSM
+{
+    if (gpuSM == 103) // gpuSM 103: accepts Sm100f and Sm103a kernels
+    {
+        return kernelSM == SmVersion::Sm100f || kernelSM == SmVersion::Sm103a;
+    }
+    else if (gpuSM == 100) // gpuSM 100: accepts Sm100f and Sm100a kernels
+    {
+        return kernelSM == SmVersion::Sm100f || kernelSM == SmVersion::Sm100a;
+    }
+    else if (gpuSM == 90) // gpuSM 90: accepts only Sm90a kernels
+    {
+        return kernelSM == SmVersion::Sm90a;
+    }
+
+    TLLM_THROW("Unexpected gpuSM %d", gpuSM); // any gpuSM other than 103/100/90 is reported as an error
+    return false; // NOTE(review): presumably unreachable if TLLM_THROW always throws - confirm
+}
+
 std::vector<int64_t> prioritizePredefinedConfigs(int m, int n, int k, std::vector<int64_t> const& sortedIndices,
     batchedGemm::batchedGemm::BatchedGemmConfig const* configs)
 {
@@ -98,6 +117,7 @@ TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(TrtllmGenBatchedGemmRunne
 
     mPassingConfigIndices.clear();
 
+    int gpuSM = tensorrt_llm::common::getSMVersion();
     for (size_t i = 0; i < bmm.getNumBatchedGemmConfigs(); ++i)
     {
         auto const options = configs[i].mOptions;
@@ -108,7 +128,7 @@ TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(TrtllmGenBatchedGemmRunne
             && options.mTransposeMmaOutput == mOptions.transposeMmaOutput
             && (!doesRouteImplUseNoRoute(options.mRouteImpl)) == mOptions.routeAct
             && options.mFusedAct == mOptions.fusedAct && options.mIsStaticBatch == mOptions.staticBatch
-            && tileSize == mOptions.tileSize)
+            && tileSize == mOptions.tileSize && isSMCompatible(gpuSM, configs[i].mSm))
         {
             auto sm = configs[i].mSm;
             if (sm != SmVersion::Sm100f)
diff --git a/tests/integration/test_lists/test-db/l0_b300.yml b/tests/integration/test_lists/test-db/l0_b300.yml
@@ -15,4 +15,10 @@ l0_b300:
       backend: pytorch
   tests:
   # ------------- PyTorch tests ---------------
+  - unittest/_torch/attention # 200s
+  # - unittest/_torch/thop
+  - unittest/_torch/executor # 250s
+  - unittest/_torch/modules # 300s
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
diff --git a/tests/integration/test_lists/test-db/l0_gb300.yml b/tests/integration/test_lists/test-db/l0_gb300.yml
@@ -16,3 +16,6 @@ l0_gb300:
   tests:
   # ------------- PyTorch tests ---------------
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] # Cover nvbugs 5461712 and 5505402
+  - unittest/_torch/thop/parallel TIMEOUT (90)