
Commit c691768

raise error for group_gemm_fp8_nt_groupwise when num_groups > 1 on sm120/121 (#1862)
## 📌 Description

- Raise a `RuntimeError` for `group_gemm_fp8_nt_groupwise` when `num_groups > 1` on SM120/121.
- Skip the related tests.
- Tentatively rename `tests/GEMM` to `tests/gemm` for consistency with the other components, which use lowercase directory names across the codebase (please correct me if this is wrong!).

## 🔍 Related Issues

<!-- Link any related issues here -->

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes

@yzh119 @aleozlx @nvmbreughe @bkryu
1 parent 1595175 commit c691768

13 files changed: +13 −2 lines changed

flashinfer/gemm.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -2304,6 +2304,11 @@ def group_gemm_fp8_nt_groupwise(
     assert out.dtype == out_dtype

     if is_sm120a_supported(a.device) or is_sm121a_supported(a.device):
+        # it has correctness issues for num_groups > 1
+        if num_groups > 1:
+            raise RuntimeError(
+                "group_gemm_fp8_nt_groupwise has correctness issues for num_groups > 1 on SM120/121"
+            )
         # SM120/121 doesn't use mma_sm parameter
         get_gemm_sm120_module().group_gemm_fp8_nt_groupwise(
             int_workspace_buffer,
```
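The practical effect of this guard is that callers on SM120/121 must keep `num_groups == 1` until the kernel is fixed. Below is a minimal caller-side sketch of the same pre-check; `is_sm12x` and `check_group_count` are illustrative names rather than FlashInfer API, and the capability test mirrors the one used in the test skip further down.

```python
import torch


def is_sm12x(device: torch.device | None = None) -> bool:
    """Whether the CUDA device reports major compute capability 12 (SM120/SM121)."""
    if not torch.cuda.is_available():
        return False
    major, _ = torch.cuda.get_device_capability(device)
    return major == 12


def check_group_count(num_groups: int, device: torch.device | None = None) -> None:
    """Mirror the guard added in this commit before calling the grouped FP8 GEMM."""
    if num_groups > 1 and is_sm12x(device):
        raise RuntimeError(
            "group_gemm_fp8_nt_groupwise has correctness issues for "
            "num_groups > 1 on SM120/121"
        )
```

One possible workaround on affected hardware is to split a multi-group call into per-group calls, at the cost of launching one kernel per group.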

scripts/task_jit_run_tests_part1.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -7,7 +7,7 @@ set -x

 pip install -e . -v

-# pytest -s tests/GEMM/test_group_gemm.py
+# pytest -s tests/gemm/test_group_gemm.py
 pytest -s tests/attention/test_logits_cap.py
 pytest -s tests/attention/test_sliding_window.py
 pytest -s tests/attention/test_tensor_cores_decode.py
```

scripts/task_jit_run_tests_part4.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -9,7 +9,7 @@ pip install -e . -v

 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True # avoid memory fragmentation
 pytest -s tests/attention/test_deepseek_mla.py
-pytest -s tests/GEMM/test_group_gemm.py
+pytest -s tests/gemm/test_group_gemm.py
 pytest -s tests/attention/test_batch_prefill_kernels.py
 # NOTE(Zihao): need to fix tile size on KV dimension for head_dim=256 on small shared memory architecture (sm89)
 # pytest -s tests/attention/test_batch_attention.py
```
5 files renamed without changes.

tests/GEMM/test_groupwise_scaled_gemm_fp8.py renamed to tests/gemm/test_groupwise_scaled_gemm_fp8.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -146,6 +146,12 @@ def test_fp8_groupwise_group_gemm(
     scale_major_mode,
     out_dtype,
 ):
+    if group_size > 1 and torch.cuda.get_device_capability()[0] in [
+        12,
+    ]:
+        pytest.skip(
+            "group_gemm_fp8_nt_groupwise has correctness issues for num_groups > 1 on SM120/121"
+        )
     torch.random.manual_seed(0)
     tile_size = 128

```
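The inline skip above works, but if more FP8 group-GEMM tests need the same gate, the capability check could be hoisted into a shared helper. A small sketch, assuming `pytest` and `torch` are available; `skip_if_multi_group_on_sm12x` is an illustrative name, not part of the FlashInfer test suite:

```python
import pytest
import torch


def _is_sm12x() -> bool:
    # SM120/SM121 report a major compute capability of 12.
    return torch.cuda.is_available() and torch.cuda.get_device_capability()[0] == 12


def skip_if_multi_group_on_sm12x(num_groups: int) -> None:
    """Call at the top of a test whose parametrization uses num_groups > 1."""
    if num_groups > 1 and _is_sm12x():
        pytest.skip(
            "group_gemm_fp8_nt_groupwise has correctness issues for "
            "num_groups > 1 on SM120/121"
        )
```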

File renamed without changes.
