@@ -86,21 +86,6 @@ def create_barrier_flags(m, n, l, mma_tiler_mn, cluster_shape_mn, sm_count):
    barrier_size = Sm100BlockScaledPersistentDenseGemmKernel.compute_barrier_flag_size(
        m, n, l, mma_tiler_mn, cluster_shape_mn, sm_count
    )
-    #print("LOOK HERE", (barrier_size,))
-    # NOTE: use_2cta_instrs from blockedscaled_gemm logic
-
-    # use_2cta_instrs = mma_tiler_mn[0] == 256
-    # cta_tile_shape_mn = (
-    # mma_tiler_mn[0] // (2 if use_2cta_instrs else 1),
-    # mma_tiler_mn[1],
-    # )
-    # problem_shape_ntile_mn = (m // cta_tile_shape_mn[0], n // cta_tile_shape_mn[1])
-    # num_tiles_per_batch = problem_shape_ntile_mn[0] * problem_shape_ntile_mn[1]
-    # num_tiles = num_tiles_per_batch * l
-    # num_sms = torch.cuda.get_device_properties("cuda").multi_processor_count
-    # +num_sms for final barrier
-    # num_tiles + num_sms
-
    barrier_flag = symm_mem.empty((barrier_size,), device="cuda", dtype=torch.int32)

    barrier_flag.fill_(0)
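Note: the comment block deleted in this hunk spelled out the tile/SM counting behind the barrier size. As a reference only, here is a minimal sketch of that arithmetic under the assumptions stated in the deleted comment; `compute_barrier_flag_size` remains the source of truth, and the helper name `estimate_barrier_flag_size` below is hypothetical, not part of the kernel API.

```python
# Sketch only: reconstructs the counting from the deleted comment, not the
# kernel's actual compute_barrier_flag_size implementation.
import torch


def estimate_barrier_flag_size(m, n, l, mma_tiler_mn):
    # A 256-wide MMA M tile implies 2-CTA instructions, halving the per-CTA M tile.
    use_2cta_instrs = mma_tiler_mn[0] == 256
    cta_tile_m = mma_tiler_mn[0] // (2 if use_2cta_instrs else 1)
    cta_tile_n = mma_tiler_mn[1]
    # Output tiles per batch, then across all l batches.
    num_tiles = (m // cta_tile_m) * (n // cta_tile_n) * l
    # Plus one flag per SM for the final barrier, per the deleted note.
    num_sms = torch.cuda.get_device_properties("cuda").multi_processor_count
    return num_tiles + num_sms
```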
@@ -158,8 +143,6 @@ def run_blockscaled_gemm_all_reduce_python_interface(
    l, m = lm
    k, n = kn

-    #print(f"device: {device}")
-
    if not Sm100BlockScaledPersistentDenseGemmKernel.can_implement(
        get_cutlass_dtype(ab_dtype),
        get_cutlass_dtype(sf_dtype),
@@ -201,7 +184,6 @@ def run_blockscaled_gemm_all_reduce_python_interface(
        init_type=cutlass_torch.TensorInitType.SCALAR,
        init_config=cutlass_torch.ScalarInitConfig(value=0.0),
    )
-    #print(f"Rank {rank}: c_ref INITIAL shape={c_ref.shape}, stride={c_ref.stride()}")
    a_tensor, a_torch = cutlass_torch.cute_tensor_like(
        a_ref,
        get_cutlass_dtype(ab_dtype),
@@ -214,21 +196,12 @@ def run_blockscaled_gemm_all_reduce_python_interface(
        is_dynamic_layout=True,
        assumed_align=16,
    )
-    # c_tensor, c_torch = cutlass_torch.cute_tensor_like(
-    # c_ref,
-    # get_cutlass_dtype(c_dtype),
-    # is_dynamic_layout=True,
-    # assumed_align=16,
-    # )
    c_tensor, c_tensor_mc, c_torch, c_torch_mc = create_mc_tensor(
        c_ref,
        get_cutlass_dtype(c_dtype),
        # (1 if c_major == "n" else 0),
        is_dynamic_layout=True,
    )
-    # print(
-    # f"Rank {rank}: c_torch INITIAL shape={c_torch.shape}, stride={c_torch.stride()}"
-    # )
    alpha_tensor = (
        torch.randn(l, dtype=torch.float32, device=device) if fuse_alpha else None
    )
@@ -271,15 +244,11 @@ def run_blockscaled_gemm_all_reduce_python_interface(
    sfb_ref, sfb_tensor, sfb_torch = create_scale_factor_tensor(
        l, n, k, sf_vec_size, get_cutlass_dtype(sf_dtype), device
    )
-    # masked_m_tensor = torch.randint(0, m, (l,), dtype=torch.int32, device=device)
    if rank == 0:
        masked_m_tensor = torch.randint(0, m, (l,), dtype=torch.int32, device=device)
    else:
        masked_m_tensor = torch.empty((l,), dtype=torch.int32, device=device)
    torch.distributed.broadcast(masked_m_tensor, src=0)
-    # to hack and test:
-    # masked_m_tensor = torch.full((l,), m, dtype=torch.int32, device=device)
-    # print(f"Rank {rank}: masked_m = {masked_m_tensor}")
    for _ in range(iterations):
        dst_signals = (
            torch.zeros((l,), dtype=torch.uint32, device="cuda")
@@ -328,18 +297,12 @@ def run_blockscaled_gemm_all_reduce_python_interface(
        )
        # Convert c back to f32 for comparison.
        ref = ref.permute(2, 0, 1).contiguous().permute(1, 2, 0)
-        # print(f"Rank {rank}: c_ref shape={c_ref.shape}, stride={c_ref.stride()}")
-        # print(f"Rank {rank}: ref shape={ref.shape}, stride={ref.stride()}")
-        # print(f"Rank {rank}: c_torch shape={c_torch.shape}, stride={c_torch.stride()}")
        cute.testing.convert(
            c_tensor,
            from_dlpack(c_ref, assumed_align=16).mark_layout_dynamic(
                leading_dim=(1 if c_major == "n" else 0)
            ),
        )
-        # print(f"Rank {rank}: c_ref shape={c_ref.shape}, stride={c_ref.stride()}")
-        # print(f"Rank {rank}: ref shape={ref.shape}, stride={ref.stride()}")
-        # print(f"Rank {rank}: c_torch shape={c_torch.shape}, stride={c_torch.stride()}")
        if c_dtype in ("float32", "float16", "bfloat16"):
            for i in range(l):
                # skip testing c_ref & ref
@@ -481,23 +444,23 @@ def multi_process_parallel(
@pytest.mark.parametrize(
    "ab_dtype,sf_dtype,c_dtype,sf_vec_size",
    [
-        # ("float8_e5m2", "float8_e8m0fnu", "bfloat16", 32)
-        # ("float4_e2m1fn", "float8_e8m0fnu", "float16", 16),
-        # ("float4_e2m1fn", "float8_e8m0fnu", "bfloat16", 16),
-        # ("float4_e2m1fn", "float8_e8m0fnu", "float32", 16),
-        # ("float4_e2m1fn", "float8_e4m3fn", "float16", 16),
-        # ("float4_e2m1fn", "float8_e4m3fn", "bfloat16", 16),
-        # ("float4_e2m1fn", "float8_e4m3fn", "float32", 16),
-        # ("float8_e4m3fn", "float8_e8m0fnu", "bfloat16", 32),
-        # ("float8_e4m3fn", "float8_e8m0fnu", "float16", 32),
-        # ("float8_e4m3fn", "float8_e8m0fnu", "float32", 32),
-        ("float8_e4m3fn", "float8_e8m0fnu", "float8_e4m3fn", 32),
-        # ("float8_e4m3fn", "float8_e8m0fnu", "float8_e5m2", 32),
-        # ("float8_e5m2", "float8_e8m0fnu", "bfloat16", 32),
-        # ("float8_e5m2", "float8_e8m0fnu", "float16", 32),
-        # ("float8_e5m2", "float8_e8m0fnu", "float32", 32),
+        ("float8_e5m2", "float8_e8m0fnu", "bfloat16", 32),
+        ("float4_e2m1fn", "float8_e8m0fnu", "float16", 16),
+        ("float4_e2m1fn", "float8_e8m0fnu", "bfloat16", 16),
+        ("float4_e2m1fn", "float8_e8m0fnu", "float32", 16),
+        ("float4_e2m1fn", "float8_e4m3fn", "float16", 16),
+        ("float4_e2m1fn", "float8_e4m3fn", "bfloat16", 16),
+        ("float4_e2m1fn", "float8_e4m3fn", "float32", 16),
+        ("float8_e4m3fn", "float8_e8m0fnu", "bfloat16", 32),
+        ("float8_e4m3fn", "float8_e8m0fnu", "float16", 32),
+        ("float8_e4m3fn", "float8_e8m0fnu", "float32", 32),
+        # ("float8_e4m3fn", "float8_e8m0fnu", "float8_e4m3fn", 32),
+        ("float8_e4m3fn", "float8_e8m0fnu", "float8_e5m2", 32),
+        ("float8_e5m2", "float8_e8m0fnu", "bfloat16", 32),
+        ("float8_e5m2", "float8_e8m0fnu", "float16", 32),
+        ("float8_e5m2", "float8_e8m0fnu", "float32", 32),
        # ("float8_e5m2", "float8_e8m0fnu", "float8_e4m3fn", 32),
-        # ("float8_e5m2", "float8_e8m0fnu", "float8_e5m2", 32),
+        ("float8_e5m2", "float8_e8m0fnu", "float8_e5m2", 32),
    ],
)
@pytest.mark.parametrize("a_major", ["k"])
@@ -538,7 +501,6 @@ def test_cute_dsl_blockscaled_gemm_allreduce_two_shot(
        pytest.skip(
            f"world_size {world_size} is greater than available_gpus {available_gpus}"
        )
-    #device = torch.device("cuda", rank)
    major, minor = torch.cuda.get_device_capability(torch.device("cuda:0"))
    if not (major == 10 and minor == 0):
        pytest.skip("Cute-dsl backend is only supported on SM100.")