Made sparse amdgcn structs match the actual builtin signatures (A vector size halved)

chris-tsiaousis-hpc · chris-tsiaousis-hpc · commit 55c08fe07190 · 2026-03-17T08:59:03.000Z
Signed-off-by: Chris Tsiaousis <chris.tsiaousis@streamhpc.com>
diff --git a/projects/composablekernel/include/ck_tile/core/arch/mma/sparse/mfma/sparse_gfx9.hpp b/projects/composablekernel/include/ck_tile/core/arch/mma/sparse/mfma/sparse_gfx9.hpp
@@ -40,10 +40,8 @@ struct amdgcn_mma<
     using OpType                          = MfmaOp;
     static constexpr MmaOpFamily OpFamily = MmaOpFamily::SPARSE;
 
-    static constexpr index_t ABVecN = 8;
-
-    using AVecType = ext_vector_t<fp16_t, ABVecN>;
-    using BVecType = ext_vector_t<fp16_t, ABVecN>;
+    using AVecType = ext_vector_t<fp16_t, 4>;
+    using BVecType = ext_vector_t<fp16_t, 8>;
     using CVecType = ext_vector_t<fp32_t, 4>;
 
     static constexpr index_t kAMBlock = 1;
@@ -62,22 +60,12 @@ struct amdgcn_mma<
     static constexpr index_t kCompressionRatio = 2;
 
     CK_TILE_DEVICE static auto
-    exec(AVecType& aVec, BVecType const& bVec, CVecType const& cVec) -> CVecType
+    exec(AVecType const& aVec, BVecType const& bVec, CVecType const& cVec, int32_t idx) -> CVecType
     {
-        static constexpr index_t CompressedSize = ABVecN / kCompressionRatio;
-        using AVecCompressed                    = ext_vector_t<fp16_t, CompressedSize>;
-        static_assert(CompressedSize == 4);
-        // TODO: Compressing A on-the-fly should be OK for now, but we need to validate
-        // and evaluate changing this to a transform at a higher level.
-        // aVec not being const can cause problems when running multiple intrinsics.
-        const int32_t idx = ck_tile::compress_a_impl<fp16_t, CompressedSize>(aVec);
-
-        const AVecCompressed a_vec_pruned = {aVec[0], aVec[1], aVec[2], aVec[3]};
-
         using namespace sparse::detail;
         static constexpr BuiltinParams PARAMS = getBuiltinParams<CtrlFlags::CompressionIndex>();
         return {__builtin_amdgcn_smfmac_f32_16x16x32_f16(
-            a_vec_pruned, bVec, cVec, idx, PARAMS.UseFirstIndex, PARAMS.ByteIndexToOverride)};
+            aVec, bVec, cVec, idx, PARAMS.UseFirstIndex, PARAMS.ByteIndexToOverride)};
     }
 };
 
diff --git a/projects/composablekernel/include/ck_tile/core/arch/mma/sparse/wmma/sparse_gfx12.hpp b/projects/composablekernel/include/ck_tile/core/arch/mma/sparse/wmma/sparse_gfx12.hpp
@@ -29,10 +29,8 @@ struct amdgcn_mma<fp16_t,
     using OpType                          = WmmaOp;
     static constexpr MmaOpFamily OpFamily = MmaOpFamily::SPARSE;
 
-    static constexpr index_t ABVecN = 16;
-
-    using AVecType = ext_vector_t<fp16_t, ABVecN>;
-    using BVecType = ext_vector_t<fp16_t, ABVecN>;
+    using AVecType = ext_vector_t<fp16_t, 8>;
+    using BVecType = ext_vector_t<fp16_t, 16>;
     using CVecType = ext_vector_t<fp32_t, 8>;
 
     static constexpr index_t kAMBlock = 1;
@@ -51,20 +49,9 @@ struct amdgcn_mma<fp16_t,
     static constexpr index_t kCompressionRatio = 2;
 
     CK_TILE_DEVICE static auto
-    exec(AVecType& aVec, BVecType const& bVec, CVecType const& cVec) -> CVecType
+    exec(AVecType const& aVec, BVecType const& bVec, CVecType const& cVec, int32_t idx) -> CVecType
     {
-        static constexpr index_t CompressedSize = ABVecN / kCompressionRatio;
-        using AVecCompressed                    = ext_vector_t<fp16_t, CompressedSize>;
-        static_assert(CompressedSize == 8);
-        // TODO: Compressing A on-the-fly should be OK for now, but we need to validate
-        // and evaluate changing this to a transform at a higher level.
-        // aVec not being const can cause problems when running multiple intrinsics.
-        const int32_t idx = ::ck_tile::compress_a_impl<fp16_t, CompressedSize>(aVec);
-
-        const AVecCompressed a_vec_pruned = {
-            aVec[0], aVec[1], aVec[2], aVec[3], aVec[4], aVec[5], aVec[6], aVec[7]};
-
-        return {__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a_vec_pruned, bVec, cVec, idx)};
+        return {__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(aVec, bVec, cVec, idx)};
     }
 };