pytorch · SS-JIA · Nov 5, 2025 · Nov 4, 2025 · Nov 4, 2025
@@ -630,7 +630,7 @@ def register_dequantize_for_conv2d_op():
 @update_features("llama::sdpa_with_kv_cache")
 def register_sdpa_with_kv_cache_op():
     return OpFeatures(
-        inputs_storage=utils.WIDTH_PACKED_TEXTURE,
+        inputs_storage=utils.CONTIGUOUS_ANY,  # NOTE(review): presumably admits buffer-backed inputs as well as textures, matching the buffer shader variants this patch adds — confirm against utils.CONTIGUOUS_ANY
         supports_resize=True,
         supports_prepacking=True,
     )

@@ -12,10 +12,14 @@ sdpa_compute_attn_weights_coop:
     TILE_K4: 1
     TILE_N4: 1
   generate_variant_forall:
+    combination:
+      parameter_names: [IO_STORAGE, K_CACHE_STORAGE]
+      combos:
+        - parameter_values: [texture3d, texture3d]
+        - parameter_values: [buffer, texture3d]
+        - parameter_values: [buffer, buffer]
     DTYPE:
       - VALUE: float
       - VALUE: half
   shader_variants:
-    - NAME: sdpa_compute_attn_weights_coop_texture3d_texture3d
-    - NAME: sdpa_compute_attn_weights_coop_buffer_texture3d
-      IO_STORAGE: buffer
+    - NAME: sdpa_compute_attn_weights_coop
@@ -13,10 +13,14 @@ sdpa_compute_attn_weights_tiled:
     TILE_K4: 1
     TILE_N4: 1
   generate_variant_forall:
+    combination:
+      parameter_names: [IO_STORAGE, K_CACHE_STORAGE]
+      combos:
+        - parameter_values: [texture3d, texture3d]
+        - parameter_values: [buffer, texture3d]
+        - parameter_values: [buffer, buffer]
     DTYPE:
       - VALUE: float
       - VALUE: half
   shader_variants:
-    - NAME: sdpa_compute_attn_weights_tiled_texture3d_texture3d
-    - NAME: sdpa_compute_attn_weights_tiled_buffer_texture3d
-      IO_STORAGE: buffer
+    - NAME: sdpa_compute_attn_weights_tiled
@@ -12,10 +12,14 @@ sdpa_compute_out_coop:
     TILE_K4: 1
     TILE_N4: 1
   generate_variant_forall:
+    combination:
+      parameter_names: [IO_STORAGE, V_CACHE_STORAGE]
+      combos:
+        - parameter_values: [texture3d, texture3d]
+        - parameter_values: [buffer, texture3d]
+        - parameter_values: [buffer, buffer]
     DTYPE:
       - VALUE: float
       - VALUE: half
   shader_variants:
-    - NAME: sdpa_compute_out_coop_texture3d_texture3d
-    - NAME: sdpa_compute_out_coop_buffer_texture3d
-      IO_STORAGE: buffer
+    - NAME: sdpa_compute_out_coop
@@ -13,10 +13,14 @@ sdpa_compute_out_tiled:
     TILE_K4: 1
     TILE_N4: 1
   generate_variant_forall:
+    combination:
+      parameter_names: [IO_STORAGE, V_CACHE_STORAGE]
+      combos:
+        - parameter_values: [texture3d, texture3d]
+        - parameter_values: [buffer, texture3d]
+        - parameter_values: [buffer, buffer]
     DTYPE:
       - VALUE: float
       - VALUE: half
   shader_variants:
-    - NAME: sdpa_compute_out_tiled_texture3d_texture3d
-    - NAME: sdpa_compute_out_tiled_buffer_texture3d
-      IO_STORAGE: buffer
+    - NAME: sdpa_compute_out_tiled
@@ -5,6 +5,8 @@
 #define IN_VEC4_T ${texel_load_type(DTYPE, INPUT_STORAGE)}
 #define T ${buffer_scalar_type(DTYPE)}
 
+$if OUTPUT_STORAGE == "buffer":
+  #define OUTPUT_BUFFER
 $if INPUT_STORAGE == "buffer":
   #define INPUT_BUFFER
 

@@ -10,10 +10,14 @@ sdpa_kv_cache_update:
     INPUT_STORAGE: texture3d
     OUTPUT_STORAGE: texture3d
   generate_variant_forall:
+    combination:
+      parameter_names: [OUTPUT_STORAGE, INPUT_STORAGE]
+      combos:
+        - parameter_values: [texture3d, texture3d]
+        - parameter_values: [texture3d, buffer]
+        - parameter_values: [buffer, buffer]
     DTYPE:
       - VALUE: half
       - VALUE: float
   shader_variants:
-    - NAME: sdpa_kv_cache_update_texture3d
-    - NAME: sdpa_kv_cache_update_buffer
-      INPUT_STORAGE: buffer
+    - NAME: sdpa_kv_cache_update
@@ -282,6 +282,7 @@ void add_sdpa_kv_cache_update_node(
     const ValueRef projected,
     const ValueRef cache) {
   std::string kernel_name("sdpa_kv_cache_update");
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(cache));
   add_storage_type_suffix(kernel_name, graph.storage_type_of(projected));
   add_dtype_suffix(kernel_name, graph.dtype_of(projected));
 
@@ -525,10 +526,11 @@ void sdpa_with_kv_cache_impl(
 
   (void)sequence_len;
 
-  const ValueRef k_cache = prepack_standard(
-      graph, k_cache_data, utils::kTexture3D, utils::kWidthPacked);
-  const ValueRef v_cache = prepack_standard(
-      graph, v_cache_data, utils::kTexture3D, utils::kWidthPacked);
+  utils::StorageType cache_storage = graph.storage_type_of(q_projected);
+  const ValueRef k_cache =
+      prepack_standard(graph, k_cache_data, cache_storage, utils::kWidthPacked);
+  const ValueRef v_cache =
+      prepack_standard(graph, v_cache_data, cache_storage, utils::kWidthPacked);
 
   update_cache_impl(graph, {k_projected, k_cache, input_pos_symint, -1});
   update_cache_impl(graph, {v_projected, v_cache, input_pos_symint, -1});
@@ -546,10 +548,51 @@ void sdpa_with_kv_cache_impl(
        out});
 }
 
+void compute_attn_weight_with_kv_cache_impl( // Updates the K/V caches, then adds only the attention-weights node (no output-computation stage is added here).
+    ComputeGraph& graph,
+    const std::vector<ValueRef>& args) { // args: 11 positional inputs followed by the output ref
+  int arg_idx = 0;
+  const ValueRef q_projected = args[arg_idx++];
+  const ValueRef k_projected = args[arg_idx++];
+  const ValueRef v_projected = args[arg_idx++];
+  const ValueRef k_cache_data = args[arg_idx++];
+  const ValueRef v_cache_data = args[arg_idx++];
+  const ValueRef input_pos_symint = args[arg_idx++];
+  const ValueRef sequence_len = args[arg_idx++]; // read to keep positional order; discarded below
+  const ValueRef attn_mask = args[arg_idx++];
+  (void)attn_mask; // accepted but unused by this function
+  const ValueRef dropout_p = args[arg_idx++];
+  (void)dropout_p; // accepted but unused by this function
+  const ValueRef is_causal = args[arg_idx++];
+  (void)is_causal; // accepted but unused by this function
+  const ValueRef scale = args[arg_idx++];
+  (void)scale; // accepted but unused by this function
+
+  // Output tensors
+  const ValueRef out = args[arg_idx++];
+
+  (void)sequence_len; // accepted but unused by this function
+
+  utils::StorageType cache_storage = graph.storage_type_of(q_projected); // both caches are prepacked with the same storage type as q_projected
+  const ValueRef k_cache =
+      prepack_standard(graph, k_cache_data, cache_storage, utils::kWidthPacked);
+  const ValueRef v_cache =
+      prepack_standard(graph, v_cache_data, cache_storage, utils::kWidthPacked);
+
+  update_cache_impl(graph, {k_projected, k_cache, input_pos_symint, -1}); // NOTE(review): meaning of the trailing -1 is defined by update_cache_impl, not visible here — TODO confirm
+  update_cache_impl(graph, {v_projected, v_cache, input_pos_symint, -1}); // NOTE(review): V cache is updated, but v_cache is not passed to the node added below
+
+  add_sdpa_compute_attn_weights_node( // built from q_projected and k_cache; `out` is passed as the final argument
+      graph, q_projected, k_cache, input_pos_symint, out);
+}
+
 REGISTER_OPERATORS {
   VK_REGISTER_OP(sdpa_with_kv_cache.default, sdpa_with_kv_cache_impl);
   VK_REGISTER_OP(update_cache.default, update_cache_impl);
   VK_REGISTER_OP(llama.custom_sdpa.default, sdpa_impl);
+  VK_REGISTER_OP( // NOTE(review): op lives under the testing.* name — presumably intended for tests only; confirm it belongs in this registry
+      testing.compute_attn_weight_with_kv_cache.default,
+      compute_attn_weight_with_kv_cache_impl);
 }
 
 } // namespace vkcompute