
Commit 24ffead

[tuner] update the calculation of shared memory usage

Signed-off-by: Bangtian Liu <[email protected]>

1 parent: 3e8e700

4 files changed: +51 −12 lines

amdsharktuner/amdsharktuner/constraint_generator.py

Lines changed: 1 addition & 3 deletions

@@ -244,9 +244,7 @@ def set_cdim_tile_sizes(tile_sizes, contraction_dims, csizes):
     promote_operands = [0, 1]
     padding = None
     if required_padding:
-        # TODO: Remove promotion of operand 2 once codegen supports handling padded outputs without promotion.
-        promote_operands = [0, 1, 2]
-        _, _, mma_intrinsic_k = mma_attr.mnk_shape
+        mma_intrinsic_k = mma_attr.mnk_shape[2]
         padding = [
             *(workgroup_tile_sizes[d] for d in contraction_dims.m),
             *(workgroup_tile_sizes[d] for d in contraction_dims.n),
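
Note: after this change the tuner no longer promotes the output operand (operand 2) when padding is required; the mma intrinsic K size is read directly from mnk_shape[2], and the output tile is only charged to shared memory when promote_operands explicitly includes 2 (see the updated calculation below).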

amdsharktuner/amdsharktuner/dispatch_constraints.py

Lines changed: 23 additions & 3 deletions

@@ -161,17 +161,37 @@ def get_dispatch_constraints(
 def calculate_shared_memory_usage_in_bytes(
     lhs_type: common.ShapedType,
     rhs_type: common.ShapedType,
+    res_type: common.ShapedType,
     m: list[int] | list[z3.ArithRef],
     n: list[int] | list[z3.ArithRef],
     k: list[int] | list[z3.ArithRef],
+    promote_operands: list[int] = [0, 1],
 ) -> int | z3.ArithRef:
+    assert promote_operands == [0, 1] or promote_operands == [
+        0,
+        1,
+        2,
+    ], f"Got {promote_operands}"
+
     lhs_memory = lhs_type.bitwidth // 8
     for size in m + k:
         lhs_memory *= size
+
     rhs_memory = rhs_type.bitwidth // 8
     for size in n + k:
         rhs_memory *= size
-    return lhs_memory + rhs_memory
+
+    output_memory = res_type.bitwidth // 8
+    for size in m + n:
+        output_memory *= size
+
+    total_memory = (
+        int(0 in promote_operands) * lhs_memory
+        + int(1 in promote_operands) * rhs_memory
+        + int(2 in promote_operands) * output_memory
+    )
+
+    return total_memory


 def generate_vector_distribute_constraints(

@@ -258,7 +278,7 @@ def generate_vector_distribute_constraints(
     constraints += [subgroups >= 1, subgroups <= 10]

     shared_memory = calculate_shared_memory_usage_in_bytes(
-        lhs_type, rhs_type, [m], [n], [k]
+        lhs_type, rhs_type, res_type, [m], [n], [k]
     )
     constraints += [shared_memory <= gpu_target_info.max_workgroup_memory_bytes]

@@ -360,7 +380,7 @@ def generate_tile_and_fuse_constraints(
     constraints += [wg_threads == subgroups * subgroup_size]

     shared_memory = calculate_shared_memory_usage_in_bytes(
-        lhs_type, rhs_type, m_tiles, n_tiles, k_tiles
+        lhs_type, rhs_type, res_type, m_tiles, n_tiles, k_tiles
     )
     constraints += [
         shared_memory * intrinsic_k <= gpu_target_info.max_workgroup_memory_bytes
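
The updated formula only charges shared memory for operands that are actually promoted. A minimal standalone sketch of the arithmetic, using plain integer bit widths in place of common.ShapedType and omitting the z3.ArithRef symbolic path (the helper name shared_memory_bytes is hypothetical):

def shared_memory_bytes(lhs_bits, rhs_bits, res_bits, m, n, k, promote_operands=(0, 1)):
    # Bytes per operand tile: bytes-per-element times the product of tile sizes.
    lhs_memory = lhs_bits // 8
    for size in m + k:
        lhs_memory *= size
    rhs_memory = rhs_bits // 8
    for size in n + k:
        rhs_memory *= size
    output_memory = res_bits // 8
    for size in m + n:
        output_memory *= size
    # Only operands promoted to shared memory are counted.
    return (
        int(0 in promote_operands) * lhs_memory
        + int(1 in promote_operands) * rhs_memory
        + int(2 in promote_operands) * output_memory
    )

# f16 lhs/rhs (2 bytes), f32 result (4 bytes), tiles m=[512], n=[64], k=[128]:
assert shared_memory_bytes(16, 16, 32, [512], [64], [128]) == 147456             # 131072 + 16384
assert shared_memory_bytes(16, 16, 32, [512], [64], [128], (0, 1, 2)) == 278528  # + 131072 for the output tile

These values match the expectations in the updated dispatch_constraints_test.py below.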

amdsharktuner/tests/constraint_generator_test.py

Lines changed: 2 additions & 2 deletions

@@ -296,7 +296,7 @@ def test_generate_solutions_tile_and_fuse_contraction_padding(
         lowering_config
     ), f"Missing padding in lowering config: {lowering_config}"
     promote = [int(x) for x in lowering_config.attributes["promote_operands"]]
-    assert promote == [0, 1, 2]
+    assert promote == [0, 1]


 def test_generate_solutions_tile_and_fuse_conv_padding(

@@ -373,7 +373,7 @@ def test_generate_solutions_tile_and_fuse_conv_padding(
         lowering_config
     ), f"Missing padding in lowering config: {lowering_config}"
     promote = [int(x) for x in lowering_config.attributes["promote_operands"]]
-    assert promote == [0, 1, 2]
+    assert promote == [0, 1]


 def test_adjust_problem_size_for_pipeline(

amdsharktuner/tests/dispatch_constraints_test.py

Lines changed: 25 additions & 4 deletions

@@ -43,36 +43,57 @@ def gpu_target_info(tuner_ctx: common.TunerContext) -> iree_gpu.TargetInfo:
 def test_calculate_shared_memory_usage_in_bytes(tuner_ctx: common.TunerContext) -> None:
     lhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.f16)
     rhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.f16)
+    res_type = common.ShapedType([1024, 1024], tuner_ctx.type.f32)
     assert (
         dispatch_constraints.calculate_shared_memory_usage_in_bytes(
-            rhs_type, rhs_type, [512], [64], [128]
+            lhs_type, rhs_type, res_type, [512], [64], [128]
         )
         == 147456
     )

     lhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.i8)
     assert (
         dispatch_constraints.calculate_shared_memory_usage_in_bytes(
-            lhs_type, rhs_type, [512], [64], [128]
+            lhs_type, rhs_type, res_type, [512], [64], [128]
         )
         == 81920
     )

     rhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.i32)
     assert (
         dispatch_constraints.calculate_shared_memory_usage_in_bytes(
-            lhs_type, rhs_type, [128], [64], [32]
+            lhs_type, rhs_type, res_type, [128], [64], [32]
         )
         == 12288
     )

     assert (
         dispatch_constraints.calculate_shared_memory_usage_in_bytes(
-            lhs_type, rhs_type, [2, 64], [4, 16], [8, 4]
+            lhs_type, rhs_type, res_type, [2, 64], [4, 16], [8, 4]
         )
         == 12288
     )

+    lhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.f16)
+    rhs_type = common.ShapedType([1024, 1024], tuner_ctx.type.f16)
+    res_type = common.ShapedType([1024, 1024], tuner_ctx.type.f32)
+    assert (
+        dispatch_constraints.calculate_shared_memory_usage_in_bytes(
+            lhs_type, rhs_type, res_type, [512], [64], [128], promote_operands=[0, 1, 2]
+        )
+        == 278528
+    )
+
+    with pytest.raises(AssertionError):
+        dispatch_constraints.calculate_shared_memory_usage_in_bytes(
+            lhs_type, rhs_type, res_type, [512], [64], [128], promote_operands=[0]
+        )
+
+    with pytest.raises(AssertionError):
+        dispatch_constraints.calculate_shared_memory_usage_in_bytes(
+            lhs_type, rhs_type, res_type, [512], [64], [128], promote_operands=[1, 2]
+        )
+

 def test_generate_tile_and_fuse_constraints_valid_input(
     tuner_ctx: common.TunerContext, gpu_target_info: iree_gpu.TargetInfo
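
For reference, each expected value is bytes-per-element times tile volume, summed over the promoted operands: 147456 = 2*512*128 + 2*64*128 (f16 lhs and rhs), 81920 = 1*512*128 + 2*64*128 (i8 lhs), and 12288 = 1*128*32 + 4*64*32 (i32 rhs); the multi-dimensional tiles [2, 64], [4, 16], [8, 4] multiply out to the same 128, 64, 32. With the default promote_operands=[0, 1] the f32 res_type does not contribute; promoting operand 2 adds the 4*512*64 = 131072-byte output tile, giving 278528.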
