
Commit 331d5cf

chunyuan-w authored and pytorchmergebot committed
[inductor] [cpp] Support vectorization for score and mask in FlexAttention CPU (pytorch#143638)
## Description

This PR generates vectorized kernels for the score and mask functions in FlexAttention on CPU.

## Modification

The main changes include:

- For the input and output buffers of the mask and score functions, we now pass tensors instead of scalars.
- For the mask function, the original function works on a scalar and only contains the logic for computing the mask value. This PR adds the logic of applying the mask to the `qk_data` tensor into the graph and then leverages the CPP backend to generate vectorized kernels.

The original mask graph:
```python
def mask_fn(b, h, q_idx, kv_idx):
    mask = q_idx >= kv_idx
    return mask
```

The converted mask graph should be:
```python
def converted_mask_fn(qk_data, b, h, q_idx, kv_idx):
    mask = q_idx >= kv_idx
    qk_data = torch.where(mask, qk_data, torch.full_like(qk_data, -float("inf")))
    return qk_data
```

## Benchmark

For q, k, v of shape `[1, 32, 1024, 128]`, using 40 CPU cores, we observe over a 20x speedup compared with the non-vectorized version, for both `is_causal = False` and `is_causal = True`.

## Test plan

The existing FlexAttention UTs (`test/inductor/test_flex_attention.py`, `test/inductor/test_flex_decoding.py`) cover the change in this PR.

## Output code

**Code before this PR is in scalar version:**
```cpp
// apply score mod function
for (int64_t row = 0; row < cur_qSplitSize; ++row) {
  for (int64_t col = 0; col < cur_kvSplitSize; col++) {
    std::vector<int64_t> b_idx = {i};
    std::vector<int64_t> h_idx = {j};
    std::vector<int64_t> q_idx = {m+row};
    int64_t phisical_kv_idx = n+col;
    if (use_kv_indice) {
      phisical_kv_idx= *kv_logical_data * kvBlockSize + col;
    }
    std::vector<int64_t> kv_idx = {phisical_kv_idx};
    accum_t* in_ptr0 = qk_data + row * cur_kvSplitSize + col;
    auto in_ptr1 = b_idx.data();
    auto in_ptr2 = h_idx.data();
    auto in_ptr3 = q_idx.data();
    auto in_ptr4 = kv_idx.data();
    accum_t* out_ptr0 = in_ptr0;
    {
      {
        {
          auto tmp0 = in_ptr0[static_cast<int64_t>(0L)];
          out_ptr0[static_cast<int64_t>(0L)] = tmp0;
        }
      }
    }
  }
}
// Apply block mask, fill unused with -inf
for (int64_t row = 0; row < cur_qSplitSize; ++row) {
  for (int64_t col = 0; col < cur_kvSplitSize; col++) {
    std::vector<int64_t> b_idx = {i};
    std::vector<int64_t> h_idx = {j};
    std::vector<int64_t> q_idx = {m+row};
    int64_t phisical_kv_idx = n+col;
    if (use_kv_indice) {
      phisical_kv_idx= *kv_logical_data * kvBlockSize + col;
    }
    std::vector<int64_t> kv_idx = {phisical_kv_idx};
    accum_t* qk_block = qk_data + row * cur_kvSplitSize + col;
    auto in_ptr1 = b_idx.data();
    auto in_ptr2 = h_idx.data();
    auto in_ptr3 = q_idx.data();
    auto in_ptr4 = kv_idx.data();
    std::vector<int64_t> temp = {0};
    int64_t* out_ptr1 = temp.data();
    {
      {
        {
          auto tmp0 = static_cast<bool>(true);
          out_ptr1[static_cast<int64_t>(0L)] = tmp0;
        }
      }
    }
    *qk_block = *out_ptr1 != 0
        ? *qk_block
        : -std::numeric_limits<accum_t>::infinity();
  }
}
```

**Code after this PR will be vectorized:**
```cpp
accum_t* in_ptr0 = qk_data;
auto in_ptr1 = b_idx.data();
auto in_ptr2 = h_idx.data();
auto in_ptr3 = q_idx.data();
auto in_ptr4 = kv_idx.data();
// apply score mod function
{
  accum_t* out_ptr0 = in_ptr0;
  {
    #pragma GCC ivdep
    for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(cur_qSplitSize); x0+=static_cast<int64_t>(1L))
    {
      for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(cur_kvSplitSize); x1+=static_cast<int64_t>(16L))
      {
        {
          if(C10_LIKELY(x1 >= static_cast<int64_t>(0) && x1 < static_cast<int64_t>(16L*(c10::div_floor_integer(static_cast<int64_t>(cur_kvSplitSize), static_cast<int64_t>(16L))))))
          {
            auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1 + cur_kvSplitSize*x0), static_cast<int64_t>(16));
            tmp0.store(out_ptr0 + static_cast<int64_t>(x1 + cur_kvSplitSize*x0));
          }
          if(C10_UNLIKELY(x1 >= static_cast<int64_t>(16L*(c10::div_floor_integer(static_cast<int64_t>(cur_kvSplitSize), static_cast<int64_t>(16L)))) && x1 < static_cast<int64_t>(cur_kvSplitSize)))
          {
            auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1 + cur_kvSplitSize*x0), static_cast<int64_t>(cur_kvSplitSize + ((-16L)*(c10::div_floor_integer(static_cast<int64_t>(cur_kvSplitSize), static_cast<int64_t>(16L))))));
            tmp0.store(out_ptr0 + static_cast<int64_t>(x1 + cur_kvSplitSize*x0), static_cast<int64_t>(cur_kvSplitSize + ((-16L)*(c10::div_floor_integer(static_cast<int64_t>(cur_kvSplitSize), static_cast<int64_t>(16L))))));
          }
        }
      }
    }
  }
}
// Apply block mask, fill unused with -inf
{
  accum_t* out_ptr1 = in_ptr0;
  {
    #pragma GCC ivdep
    for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(cur_qSplitSize); x0+=static_cast<int64_t>(1L))
    {
      for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(cur_kvSplitSize); x1+=static_cast<int64_t>(16L))
      {
        {
          if(C10_LIKELY(x1 >= static_cast<int64_t>(0) && x1 < static_cast<int64_t>(16L*(c10::div_floor_integer(static_cast<int64_t>(cur_kvSplitSize), static_cast<int64_t>(16L))))))
          {
            auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x1 + cur_kvSplitSize*x0), static_cast<int64_t>(16));
            auto tmp1 = static_cast<bool>(true);
            auto tmp2 = -std::numeric_limits<float>::infinity();
            auto tmp3 = at::vec::VecMask<float,1>::from(tmp1);
            auto tmp4 = at::vec::Vectorized<float>(tmp2);
            auto tmp5 = decltype(tmp0)::blendv(tmp4, tmp0, tmp3.template cast<float,1>());
            tmp5.store(out_ptr1 + static_cast<int64_t>(x1 + cur_kvSplitSize*x0));
          }
          if(C10_UNLIKELY(x1 >= static_cast<int64_t>(16L*(c10::div_floor_integer(static_cast<int64_t>(cur_kvSplitSize), static_cast<int64_t>(16L)))) && x1 < static_cast<int64_t>(cur_kvSplitSize)))
          {
            for (int64_t x1_tail = static_cast<int64_t>(16L*(c10::div_floor_integer(static_cast<int64_t>(cur_kvSplitSize), static_cast<int64_t>(16L)))); x1_tail < static_cast<int64_t>(cur_kvSplitSize); x1_tail++)
            {
              auto tmp0 = in_ptr0[static_cast<int64_t>(x1_tail + cur_kvSplitSize*x0)];
              auto tmp1 = static_cast<bool>(true);
              auto tmp2 = -std::numeric_limits<float>::infinity();
              auto tmp3 = tmp1 ? tmp0 : tmp2;
              out_ptr1[static_cast<int64_t>(x1_tail + cur_kvSplitSize*x0)] = tmp3;
            }
          }
        }
      }
    }
  }
}
```

Pull Request resolved: pytorch#143638
Approved by: https://github.com/jgong5, https://github.com/drisspg, https://github.com/leslie-fang-intel
1 parent ce38bfd commit 331d5cf
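
For context, here is a minimal usage sketch (an illustration assumed by this note, not code from the PR) of how the CPU FlexAttention path exercised by the benchmark above can be driven from Python; the shapes match the Benchmark section and the causal `mask_mod` mirrors the `mask_fn` example:

```python
# Minimal sketch (assumed usage, not part of this PR): run FlexAttention on CPU with a
# causal block mask so Inductor's CPP backend generates kernels like the ones shown above.
import torch
from torch.nn.attention.flex_attention import create_block_mask, flex_attention

B, H, S, D = 1, 32, 1024, 128  # shapes from the Benchmark section

q = torch.randn(B, H, S, D, device="cpu")
k = torch.randn(B, H, S, D, device="cpu")
v = torch.randn(B, H, S, D, device="cpu")

def causal(b, h, q_idx, kv_idx):
    # Same predicate as the mask_fn example above.
    return q_idx >= kv_idx

block_mask = create_block_mask(causal, B, H, S, S, device="cpu")
compiled_flex = torch.compile(flex_attention)  # lowers to the CPP FlexAttention template on CPU
out = compiled_flex(q, k, v, block_mask=block_mask)
print(out.shape)  # torch.Size([1, 32, 1024, 128])
```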

3 files changed: +183 -62 lines changed

torch/_inductor/codegen/cpp_flex_attention_template.py

Lines changed: 57 additions & 43 deletions
@@ -621,53 +621,45 @@
   {{kernel.kernel_name}}_mul_scale_kernel<accum_t>(qk_data, scaling_factor, cur_qSplitSize*cur_kvSplitSize);
 
 {%- if score_mod and mask_mod %}
-  // TODO: vectorization optimization for below score and mask codegen functions
-  // apply score mod function
-  for (int64_t row = 0; row < cur_qSplitSize; ++row) {
-    for (int64_t col = 0; col < cur_kvSplitSize; col++) {
-      std::vector<int64_t> b_idx = {i};
-      std::vector<int64_t> h_idx = {j};
-      std::vector<int64_t> q_idx = {m+row};
-      int64_t phisical_kv_idx = n+col;
+  // TODO: reduce the number of calls of q_idx and kv_idx initialization
+  std::vector<int64_t> q_idx(cur_qSplitSize);
+  for (int64_t i = 0; i < cur_qSplitSize; ++i) {
+    q_idx[i] = m + i;
+  }
+
+  std::vector<int64_t> kv_idx(cur_kvSplitSize);
+  for (int64_t i = 0; i < cur_kvSplitSize; ++i) {
     if (use_kv_indice) {
-      phisical_kv_idx= *kv_logical_data * kvBlockSize + col;
+      kv_idx[i] = *kv_logical_data * kvBlockSize + i;
+    } else {
+      kv_idx[i] = n + i;
     }
-    std::vector<int64_t> kv_idx = {phisical_kv_idx};
-    accum_t* in_ptr0 = qk_data + row * cur_kvSplitSize + col;
-    auto in_ptr1 = b_idx.data();
-    auto in_ptr2 = h_idx.data();
-    auto in_ptr3 = q_idx.data();
-    auto in_ptr4 = kv_idx.data();
+  }
+
+  std::vector<int64_t> b_idx = {i};
+  std::vector<int64_t> h_idx = {j};
+
+  accum_t* in_ptr0 = qk_data;
+
+  auto in_ptr1 = b_idx.data();
+  auto in_ptr2 = h_idx.data();
+  auto in_ptr3 = q_idx.data();
+  auto in_ptr4 = kv_idx.data();
+
+  // apply score mod function
+  {
     {{ template.generate_other_buffer("score_others", 0, "len_score_other", kernel.args) }}
     accum_t* out_ptr{{score_buf_idx}} = in_ptr0;
-    {{ template.modification(score_mod, score_buf_name, score_buf_idx) }}
-  }
+    {{ template.modification(score_mod, score_buf_name, score_buf_idx)|indent(12, false) }}
   }
+
   // Apply block mask, fill unused with -inf
-  for (int64_t row = 0; row < cur_qSplitSize; ++row) {
-    for (int64_t col = 0; col < cur_kvSplitSize; col++) {
-      std::vector<int64_t> b_idx = {i};
-      std::vector<int64_t> h_idx = {j};
-      std::vector<int64_t> q_idx = {m+row};
-      int64_t phisical_kv_idx = n+col;
-      if (use_kv_indice) {
-        phisical_kv_idx= *kv_logical_data * kvBlockSize + col;
-      }
-      std::vector<int64_t> kv_idx = {phisical_kv_idx};
-      accum_t* qk_block = qk_data + row * cur_kvSplitSize + col;
-      auto in_ptr1 = b_idx.data();
-      auto in_ptr2 = h_idx.data();
-      auto in_ptr3 = q_idx.data();
-      auto in_ptr4 = kv_idx.data();
+  {
     {{ template.generate_other_buffer("mask_others", -1, "len_mask_other", kernel.args) }}
-    std::vector<int64_t> temp = {0};
-    int64_t* out_ptr{{mask_buf_idx}} = temp.data();
-    {{ template.modification(mask_mod, mask_buf_name, mask_buf_idx) }}
-    *qk_block = *out_ptr{{mask_buf_idx}} != 0
-      ? *qk_block
-      : -std::numeric_limits<accum_t>::infinity();
-  }
+    accum_t* out_ptr{{mask_buf_idx}} = in_ptr0;
+    {{ template.modification(mask_mod, mask_buf_name, mask_buf_idx)|indent(12, false) }}
   }
+
 {%- endif %}
   // Update coefficients with Softmax
   accum_t tmp_max = 0, tmp_sum = 0, exp_tmp = 0;
@@ -792,6 +784,7 @@ def __init__(
         len_score_other,
         len_mask_other,
         kernel_input_name_to_buffer,
+        block_vars,
     ) -> None:
         assert layout.dtype in [torch.float, torch.bfloat16]
         super().__init__("flex_attention", input_nodes, layout, parallel_num_threads())
@@ -824,6 +817,7 @@ def get_idx(buf_name):
         self.len_score_other = len_score_other
         self.len_mask_other = len_mask_other
        self.kernel_input_name_to_buffer = kernel_input_name_to_buffer
+        self.block_vars = block_vars
         self.extra_sizevars = list(
             OrderedSet(
                 val
@@ -935,14 +929,15 @@ def modification(self, subgraph_buffer, output_name, output_idx):
         cpp_kernel_proxy = CppKernelProxy(kernel_group)
         bodies = []
         var_sizes_list = []
-
-        var_sizes = tuple([])  # type: ignore[var-annotated] # noqa: C409
-        output_index = 0
+        var_sizes = tuple(subgraph_buffer.get_size())
         var_ranges = {
             sympy_index_symbol_with_prefix(SymT.INDEX, i): sz
             for i, sz in enumerate(var_sizes)
         }
 
+        dst_layout = subgraph_buffer.get_layout()
+        output_index = dst_layout.make_indexer()([*var_ranges.keys()])
+
         def fn(*args):
             V.ops.store(
                 output_name,
@@ -970,7 +965,24 @@ def fn(*args):
 
         cpp_kernel_proxy.codegen_loop_bodies(bodies, var_sizes_list)
         kernel_group.finalize_kernel(cpp_kernel_proxy, [])
-        return kernel_group.loops_code.getvalue()
+        output_code = kernel_group.loops_code.getvalue()
+
+        var_q_symbol, var_kv_symbol = self.block_vars
+        # See [Note] Handle the case where the split sizes are not statically known.
+        # We don't know the value of qBlockSize and rkvBlockSize during compilation time
+        # thus we've represented them by symbols.
+        # We change the symbol strings back to "cur_qSplitSize" and "cur_kvSplitSize"
+        # in the generated code thus they'll be filled with the real value during runtime.
+        if var_q_symbol in kernel_group.args.sizevars:
+            output_code = output_code.replace(
+                kernel_group.args.sizevars[var_q_symbol], "cur_qSplitSize"
+            )
+        if var_kv_symbol in kernel_group.args.sizevars:
+            output_code = output_code.replace(
+                kernel_group.args.sizevars[var_kv_symbol], "cur_kvSplitSize"
+            )
+
+        return output_code
 
     @staticmethod
     def add_choices(
@@ -987,6 +999,7 @@ def add_choices(
         len_score_other,
         len_mask_other,
         kernel_input_name_to_buffer,
+        block_vars,
     ):
         def preprocessor(input_nodes, layout):
             return input_nodes, layout
@@ -1010,6 +1023,7 @@ def postprocessor(output):
             len_score_other=len_score_other,
             len_mask_other=len_mask_other,
             kernel_input_name_to_buffer=kernel_input_name_to_buffer,
+            block_vars=block_vars,
         )
         template.maybe_append_choice(choices)
         return template

torch/_inductor/codegen/cpp_template_kernel.py

Lines changed: 4 additions & 0 deletions
@@ -502,6 +502,10 @@ def store_outputs(
         )
         return ""
 
+    def check_bounds(self, expr, size, lower, upper):
+        # CppTemplateKernel does not need codegen related operations
+        return
+
 
 class CppTemplateCaller(ir.ChoiceCaller):
     """

torch/_inductor/kernel/flex_attention.py

Lines changed: 122 additions & 19 deletions
@@ -1,6 +1,7 @@
 # mypy: allow-untyped-defs
 """ Triton Implementation of the flex_attention Kernel"""
 
+import copy
 import logging
 import math
 from collections.abc import Sequence
@@ -14,6 +15,8 @@
 from torch._inductor.virtualized import V
 from torch.utils._ordered_set import OrderedSet
 from torch.utils._pytree import tree_map
+from torch.utils._sympy.numbers import int_oo
+from torch.utils._sympy.value_ranges import ValueRanges
 
 from .. import config
 from ..ir import (
@@ -100,10 +103,21 @@ def flex_attention_grid(batch_size, q_heads, num_queries, d_model, meta):
 
 
 def create_placeholder(
-    name: str, dtype: torch.dtype, device: torch.device
+    name: str,
+    dtype: torch.dtype,
+    device: torch.device,
+    size: Optional[list[int]] = None,
 ) -> TensorBox:
     """Creates a placeholder input buffers for producing subgraph_output."""
-    input_buffer = InputBuffer(name=name, layout=FixedLayout(device, dtype, [], []))
+    input_buffer = InputBuffer(
+        name=name,
+        layout=FixedLayout(
+            device,
+            dtype,
+            size if size else [],
+            FlexibleLayout.contiguous_strides(size) if size else [],
+        ),
+    )
     return TensorBox.create(input_buffer)
 
 
@@ -173,7 +187,9 @@ def zeros_and_scatter_lowering(shape: list[int], indices, values):
 SubgraphResults = Union[list[Optional[ComputedBuffer]], Optional[ComputedBuffer]]
 
 
-def build_subgraph_buffer(args: list[TensorBox], subgraph: Subgraph) -> SubgraphResults:
+def build_subgraph_module_buffer(
+    args: list[TensorBox], graph_module: torch.fx.GraphModule
+) -> SubgraphResults:
     """This function's goal is to take in the required args and produce the subgraph buffer
     The subgraph buffer is a ComputedBuffer that will be inlined into the triton template
 
@@ -184,7 +200,7 @@ def build_subgraph_buffer(args: list[TensorBox], subgraph: Subgraph) -> Subgraph
     from ..subgraph_lowering import PointwiseSubgraphLowering
 
     pw_subgraph = PointwiseSubgraphLowering(
-        subgraph.graph_module,
+        graph_module,
         root_graph_lowering=V.graph,
         allowed_mutations=OrderedSet([torch.ops.flex_lib.zeros_and_scatter.default]),
         additional_lowerings={
@@ -228,6 +244,10 @@ def convert_output_node_to_buffer(output_buffer) -> Optional[ComputedBuffer]:
     return tree_map(convert_output_node_to_buffer, pw_subgraph.graph_outputs)
 
 
+def build_subgraph_buffer(args: list[TensorBox], subgraph: Subgraph) -> SubgraphResults:
+    return build_subgraph_module_buffer(args, subgraph.graph_module)
+
+
 # Inner Triton functions shared by flex_attention & split-k decoding kernels.
 compute_next_offset_func = r"""
 @triton.jit
@@ -921,14 +941,31 @@ def lower_cpu(
     )
 
     fake_buffers: list[Buffer] = []  # noqa: F821
+
+    # [Note] Handle the case where the split sizes are not statically known.
+    # The value of cur_qSplitSize and cur_kvSplitSize are decided during runtime.
+    # We use symbols to represent them during the compilation here.
+    # They'll be replaced by the string "cur_qSplitSize" and "cur_kvSplitSize" in
+    # the modification function of the CppFlexAttentionTemplate class.
+    cur_qSplitSize = V.graph.sizevars.shape_env.create_unbacked_symint().node.expr
+    cur_kvSplitSize = V.graph.sizevars.shape_env.create_unbacked_symint().node.expr
+    shape_env = V.graph.sizevars.shape_env
+
+    # We don't know the concret value of cur_qSplitSize and cur_kvSplitSize during the compilation.
+    # Mark symbols > 1 to ensure broadcasting is always applied.
+    # This avoids treating them as equal when `eq(var, 1)` is evaluated in `broadcast_symbolic_shapes`.
+    shape_env.var_to_range[cur_qSplitSize] = ValueRanges(2, int_oo)
+    shape_env.var_to_range[cur_kvSplitSize] = ValueRanges(2, int_oo)
+
+    score_dtype = torch.float
     placeholder_inps = [
-        create_placeholder(name, dtype, query.get_device())
-        for name, dtype in [
-            ("score", torch.float),
-            ("b", torch.int64),
-            ("h", torch.int64),
-            ("q_idx", torch.int64),
-            ("kv_idx", torch.int64),
+        create_placeholder(name, dtype, query.get_device(), size)
+        for name, dtype, size in [
+            ("score", score_dtype, [cur_qSplitSize, cur_kvSplitSize]),
+            ("b", torch.int64, []),
+            ("h", torch.int64, []),
+            ("q_idx", torch.int64, [cur_qSplitSize, 1]),
+            ("kv_idx", torch.int64, [1, cur_kvSplitSize]),
         ]
     ]
     subgraph_buffer = build_subgraph_buffer(
@@ -942,18 +979,83 @@ def lower_cpu(
     else:
         subgraph_buffer.freeze_layout()
     mask_graph_placeholder_inps = [
-        create_placeholder(name, dtype, query.get_device())
-        for name, dtype in [
-            ("b", torch.int64),
-            ("h", torch.int64),
-            ("q_idx", torch.int64),
-            ("kv_idx", torch.int64),
+        create_placeholder(name, dtype, query.get_device(), size)
+        for name, dtype, size in [
+            ("score", score_dtype, [cur_qSplitSize, cur_kvSplitSize]),
+            ("b", torch.int64, []),
+            ("h", torch.int64, []),
+            ("q_idx", torch.int64, [cur_qSplitSize, 1]),
+            ("kv_idx", torch.int64, [1, cur_kvSplitSize]),
         ]
     ]
-    mask_graph_buffer = build_subgraph_buffer(
-        mask_graph_placeholder_inps + list(mask_mod_other_buffers), mask_graph
+
+    # The original mask_graph works on a scalar and only includes
+    # the logic of calculating the mask value.
+    # We need to add the logic of applying the mark to the qk_data tensor
+    # into the graph for the later codegen of this part.
+    # Example:
+    # mask_graph:
+    # def mask_fn(b, h, q_idx, kv_idx):
+    #     mask = q_idx >= kv_idx
+    #     return mask
+    # The converted_mask_graph should be:
+    # def converted_mask_fn(qk_data, b, h, q_idx, kv_idx):
+    #     mask = q_idx >= kv_idx
+    #     qk_data = torch.where(mask, qk_data, torch.full_like(qk_data, -float("inf")))
+    #     return qk_data
+    def convert_mask_graph_module(mask_graph):
+        gm = copy.deepcopy(mask_graph.graph_module)
+        graph = gm.graph
+        # Add qk_data as the first input
+        with graph.inserting_before(next(iter(graph.nodes))):
+            qk_data_node = graph.placeholder("qk_data")
+
+        # Find the node that returns the mask
+        output_node = None
+        for node in graph.nodes:
+            if node.op == "output":
+                output_node = node
+                break
+
+        # Get the mask node
+        assert output_node is not None
+        mask_node = output_node.args[0]
+
+        size_node = [cur_qSplitSize, cur_kvSplitSize]
+        # Create a new node for torch.full
+        with graph.inserting_after(mask_node):
+            full_node = graph.call_function(
+                torch.full,
+                args=(size_node, -float("inf")),
+                kwargs={"dtype": score_dtype},
+            )
+
+        # Create a new node for torch.where
+        with graph.inserting_after(full_node):
+            where_node = graph.call_function(
+                torch.ops.aten.where, args=(mask_node, qk_data_node, full_node)
+            )
+
+        # Update the output node to return the result of torch.where
+        output_node.args = (where_node,)
+
+        graph.lint()
+        converted = torch.fx.GraphModule(gm, graph)
+        return converted
+
+    converted_mask_graph_module = convert_mask_graph_module(mask_graph)
+
+    mask_graph_buffer = build_subgraph_module_buffer(
+        mask_graph_placeholder_inps + list(mask_mod_other_buffers),
+        converted_mask_graph_module,
     )
 
+    # Clear the pending fresh unbacked symbols that are created for cur_qSplitSize and cur_kvSplitSize in the current kernel.
+    pending = V.graph.sizevars.shape_env.pending_fresh_unbacked_symbols
+    V.graph.sizevars.shape_env.pending_fresh_unbacked_symbols = [
+        x for x in pending if x not in (cur_qSplitSize, cur_kvSplitSize)
+    ]
+
     buffer_list = (
         placeholder_inps
         + list(score_mod_other_buffers)
@@ -1066,6 +1168,7 @@ def lower_cpu(
         len_score_other=len(score_mod_other_buffers),
         len_mask_other=len(mask_mod_other_buffers),
         kernel_input_name_to_buffer=kernel_input_name_to_buffer,
+        block_vars=(cur_qSplitSize, cur_kvSplitSize),
     )
     inputs_for_autotuning = [
         query,

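To make the mask-graph rewrite in `flex_attention.py` above concrete, here is a self-contained FX sketch of the same idea applied to a plain Python mask function. It is illustrative only: it uses `torch.full_like` in place of the template's symbolic `[cur_qSplitSize, cur_kvSplitSize]` sizes, and the variable names are assumptions, not the PR's code.

```python
# Self-contained sketch (illustrative, not the PR's code): rewrite a scalar mask
# function so it also applies the mask to a qk tensor, mirroring
# convert_mask_graph_module above but with torch.full_like instead of symbolic sizes.
import torch
import torch.fx as fx

def mask_fn(b, h, q_idx, kv_idx):
    return q_idx >= kv_idx

gm = fx.symbolic_trace(mask_fn)
graph = gm.graph

# Add qk_data as the first placeholder so the converted graph also takes the scores.
with graph.inserting_before(next(iter(graph.nodes))):
    qk_data_node = graph.placeholder("qk_data")

output_node = next(n for n in graph.nodes if n.op == "output")
mask_node = output_node.args[0]

# Build the -inf fill and where(mask, qk_data, -inf) right after the mask is computed.
with graph.inserting_after(mask_node):
    full_node = graph.call_function(torch.full_like, args=(qk_data_node, -float("inf")))
with graph.inserting_after(full_node):
    where_node = graph.call_function(torch.where, args=(mask_node, qk_data_node, full_node))

# Return the masked scores instead of the boolean mask.
output_node.args = (where_node,)
graph.lint()
converted = fx.GraphModule(gm, graph)

qk = torch.randn(4, 4)
q_idx = torch.arange(4).view(4, 1)   # broadcasts like the [cur_qSplitSize, 1] placeholder
kv_idx = torch.arange(4).view(1, 4)  # broadcasts like the [1, cur_kvSplitSize] placeholder
out = converted(qk, 0, 0, q_idx, kv_idx)  # entries with kv_idx > q_idx are now -inf
```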