Merge branch 'main' into fix-doc-build-version-theme

svekars · web-flow · commit 72e8a2aee775 · 2025-11-05T11:14:35.000-08:00
diff --git a/backends/arm/tosa/backend.py b/backends/arm/tosa/backend.py
@@ -133,7 +133,14 @@ def _preprocess(  # noqa: C901
         if not artifact_path:
             artifact_path = ""
 
-        tosa_graph = ts.TosaSerializer(artifact_path)
+        version = tosa_spec.version
+        tosa_graph = ts.TosaSerializer(
+            artifact_path,
+            targetMajor=version.major,
+            targetMinor=version.minor,
+            targetPatch=version.micro,
+            targetDraft=False,
+        )
 
         if not (
             tosa_spec.version.major == ts.TOSA_VERSION_MAJOR
diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py
@@ -687,7 +687,7 @@ def register_dequantize_for_conv2d_op():
 @update_features("llama::sdpa_with_kv_cache")
 def register_sdpa_with_kv_cache_op():
     return OpFeatures(
-        inputs_storage=utils.WIDTH_PACKED_TEXTURE,
+        inputs_storage=utils.CONTIGUOUS_ANY,
         supports_resize=True,
         supports_prepacking=True,
     )
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weights_softmax.glsl b/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weights_softmax.glsl
@@ -76,6 +76,7 @@ void main() {
   const int Q_H = q_projected_sizes.y;
   // sequence length
   const int S = q_projected_sizes.z;
+  const int S_aligned = align_up_4(S);
   // manually determine size of the context_len dim of the attention weight.
   // The "actual" tensor sizes may have been aligned to a multiple of 4 to allow
   // memory loads to be aligned to texel boundaries.
@@ -96,7 +97,7 @@ void main() {
   // number of threads in the work group.
   for (int c4 = worker_id; c4 < C4_limit; c4 += NUM_WORKERS_PER_WG) {
     VEC4_T in_texel = load_attn_weights_c4(
-        c4, s, q_h, context_texel_len, S, Q_H);
+        c4, s, q_h, context_texel_len, S_aligned, Q_H);
 
     for (int comp = 0; comp < 4; comp++) {
       local_exp_sum += exp(in_texel[comp]);
@@ -108,7 +109,7 @@ void main() {
     for (int c4 = C4_limit; c4 < context_texel_len; ++c4) {
       const int c_base = mul_4(c4);
       VEC4_T in_texel = load_attn_weights_c4(
-          c4, s, q_h, context_texel_len, S, Q_H);
+          c4, s, q_h, context_texel_len, S_aligned, Q_H);
 
       [[unroll]] for (int comp = 0; comp < 4; comp++) {
         if (c_base + comp < context_len) {
@@ -138,19 +139,19 @@ void main() {
   // Now go back through each element in the row and normalize
   for (int c4 = worker_id; c4 < C4_limit; c4 += NUM_WORKERS_PER_WG) {
     VEC4_T in_texel = load_attn_weights_c4(
-        c4, s, q_h, context_texel_len, S, Q_H);
+        c4, s, q_h, context_texel_len, S_aligned, Q_H);
 
     VEC4_T out_texel = exp(in_texel) / local_exp_sum;
     store_attn_weights_softmax_c4(
-        out_texel, c4, s, q_h, context_texel_len, S, Q_H);
+        out_texel, c4, s, q_h, context_texel_len, S_aligned, Q_H);
   }
   // First thread in the work group responsible for handling last texel if it
   // contains any padded elements
   if (worker_id == 0) {
     for (int c4 = C4_limit; c4 < context_texel_len; ++c4) {
       const int c_base = mul_4(c4);
       VEC4_T in_texel = load_attn_weights_c4(
-          c4, s, q_h, context_texel_len, S, Q_H);
+          c4, s, q_h, context_texel_len, S_aligned, Q_H);
 
       // Ensure that padding elements are set to 0.
       VEC4_T out_texel = VEC4_T(0);
@@ -160,7 +161,7 @@ void main() {
         }
       }
       store_attn_weights_softmax_c4(
-          out_texel, c4, s, q_h, context_texel_len, S, Q_H);
+          out_texel, c4, s, q_h, context_texel_len, S_aligned, Q_H);
     }
   }
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_compute_attn_weights_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/sdpa_compute_attn_weights_coop.glsl
@@ -81,6 +81,7 @@ void main() {
   const int Q_H = q_projected_sizes.y;
   // sequence length
   const int S = q_projected_sizes.z;
+  const int S_aligned = align_up_4(S);
 
   // number of K/V heads
   const int KV_H = k_cache_sizes.y;
@@ -118,55 +119,27 @@ void main() {
   }
   // Otherwise, need to actually compute output tile
   else {
-    const bool dont_check_bounds = (S - s) >= TILE_M &&
-        (context_len - c) >= TILE_N;
-
-    if (dont_check_bounds) {
-      for (int d4 = worker_id; d4 < D4; d4 += NUM_WORKERS_PER_OUT) {
-        load_q_projected_tile_no_checks(
-          q_tile,
-          d4,
-          s,
-          q_h,
-          D4,
-          Q_H,
-          S);
-
-        load_k_cache_tile_no_checks(
-          w_tile,
-          d4,
-          c,
-          kv_h,
-          D4,
-          context_len,
-          C,
-          KV_H);
-
-        fp_accumulate_with_fp_weight(out_tile, q_tile, w_tile);
-      }
-    } else {
-      for (int d4 = worker_id; d4 < D4; d4 += NUM_WORKERS_PER_OUT) {
-        load_q_projected_tile_with_checks(
-          q_tile,
-          d4,
-          s,
-          q_h,
-          D4,
-          Q_H,
-          S);
-
-        load_k_cache_tile_with_checks(
-          w_tile,
-          d4,
-          c,
-          kv_h,
-          D4,
-          context_len,
-          C,
-          KV_H);
-
-        fp_accumulate_with_fp_weight(out_tile, q_tile, w_tile);
-      }
+    for (int d4 = worker_id; d4 < D4; d4 += NUM_WORKERS_PER_OUT) {
+      load_q_projected_tile_with_checks(
+        q_tile,
+        d4,
+        s,
+        q_h,
+        D4,
+        Q_H,
+        S);
+
+      load_k_cache_tile_with_checks(
+        w_tile,
+        d4,
+        c,
+        kv_h,
+        D4,
+        context_len,
+        C,
+        KV_H);
+
+      fp_accumulate_with_fp_weight(out_tile, q_tile, w_tile);
     }
   }
 
@@ -205,7 +178,7 @@ void main() {
       s,
       q_h,
       context_texel_len,
-      S,
+      S_aligned,
       Q_H);
   }
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_compute_attn_weights_coop.yaml b/backends/vulkan/runtime/graph/ops/glsl/sdpa_compute_attn_weights_coop.yaml
@@ -12,10 +12,14 @@ sdpa_compute_attn_weights_coop:
     TILE_K4: 1
     TILE_N4: 1
   generate_variant_forall:
+    combination:
+      parameter_names: [IO_STORAGE, K_CACHE_STORAGE]
+      combos:
+        - parameter_values: [texture3d, texture3d]
+        - parameter_values: [buffer, texture3d]
+        - parameter_values: [buffer, buffer]
     DTYPE:
       - VALUE: float
       - VALUE: half
   shader_variants:
-    - NAME: sdpa_compute_attn_weights_coop_texture3d_texture3d
-    - NAME: sdpa_compute_attn_weights_coop_buffer_texture3d
-      IO_STORAGE: buffer
+    - NAME: sdpa_compute_attn_weights_coop
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_compute_attn_weights_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/sdpa_compute_attn_weights_tiled.glsl
@@ -93,6 +93,7 @@ void main() {
   const int Q_H = q_projected_sizes.y;
   // sequence length
   const int S = q_projected_sizes.z;
+  const int S_aligned = align_up_4(S);
 
   // number of K/V heads
   const int KV_H = k_cache_sizes.y;
@@ -129,55 +130,28 @@ void main() {
   }
   // Otherwise, need to actually compute output tile
   else {
-    const bool dont_check_bounds = (S - s) >= TILE_M &&
-        (context_len - c) >= TILE_N;
-
-    if (dont_check_bounds) {
-      for (int d4 = 0; d4 < D4; d4++) {
-        load_q_projected_tile_no_checks(
-          q_tile,
-          d4,
-          s,
-          q_h,
-          D4,
-          Q_H,
-          S);
-
-        load_k_cache_tile_no_checks(
-          w_tile,
-          d4,
-          c,
-          kv_h,
-          D4,
-          context_len,
-          C,
-          KV_H);
-
-        fp_accumulate_with_fp_weight(out_tile, q_tile, w_tile);
-      }
-    } else {
-      for (int d4 = 0; d4 < D4; d4++) {
-        load_q_projected_tile_with_checks(
-          q_tile,
-          d4,
-          s,
-          q_h,
-          D4,
-          Q_H,
-          S);
-
-        load_k_cache_tile_with_checks(
-          w_tile,
-          d4,
-          c,
-          kv_h,
-          D4,
-          context_len,
-          C,
-          KV_H);
-
-        fp_accumulate_with_fp_weight(out_tile, q_tile, w_tile);
-      }
+    for (int d4 = 0; d4 < D4; d4++) {
+      load_q_projected_tile_with_checks(
+        q_tile,
+        d4,
+        s,
+        q_h,
+        D4,
+        Q_H,
+        S);
+
+      load_k_cache_tile_with_checks(
+        w_tile,
+        d4,
+        c,
+        kv_h,
+        D4,
+        context_len,
+        C,
+        KV_H);
+
+
+      fp_accumulate_with_fp_weight(out_tile, q_tile, w_tile);
     }
 
     // Apply scale and mask
@@ -196,6 +170,6 @@ void main() {
     s,
     q_h,
     context_texel_len,
-    S,
+    S_aligned,
     Q_H);
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_compute_attn_weights_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/sdpa_compute_attn_weights_tiled.yaml
@@ -13,10 +13,14 @@ sdpa_compute_attn_weights_tiled:
     TILE_K4: 1
     TILE_N4: 1
   generate_variant_forall:
+    combination:
+      parameter_names: [IO_STORAGE, K_CACHE_STORAGE]
+      combos:
+        - parameter_values: [texture3d, texture3d]
+        - parameter_values: [buffer, texture3d]
+        - parameter_values: [buffer, buffer]
     DTYPE:
       - VALUE: float
       - VALUE: half
   shader_variants:
-    - NAME: sdpa_compute_attn_weights_tiled_texture3d_texture3d
-    - NAME: sdpa_compute_attn_weights_tiled_buffer_texture3d
-      IO_STORAGE: buffer
+    - NAME: sdpa_compute_attn_weights_tiled
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_compute_out_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/sdpa_compute_out_coop.glsl
@@ -81,6 +81,7 @@ void main() {
   const int Q_H = q_projected_sizes.y;
   // sequence length
   const int S = q_projected_sizes.z;
+  const int S_aligned = align_up_4(S);
 
   // number of K/V heads
   const int KV_H = v_cache_sizes.y;
@@ -120,7 +121,7 @@ void main() {
       s,
       q_h,
       context_texel_len,
-      S,
+      S_aligned,
       Q_H);
 
     load_v_cache_tile_no_checks(
@@ -146,7 +147,7 @@ void main() {
         s,
         q_h,
         context_texel_len,
-        S,
+        S_aligned,
         Q_H);
 
       load_v_cache_tile_with_checks(
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_compute_out_coop.yaml b/backends/vulkan/runtime/graph/ops/glsl/sdpa_compute_out_coop.yaml
@@ -12,10 +12,14 @@ sdpa_compute_out_coop:
     TILE_K4: 1
     TILE_N4: 1
   generate_variant_forall:
+    combination:
+      parameter_names: [IO_STORAGE, V_CACHE_STORAGE]
+      combos:
+        - parameter_values: [texture3d, texture3d]
+        - parameter_values: [buffer, texture3d]
+        - parameter_values: [buffer, buffer]
     DTYPE:
       - VALUE: float
       - VALUE: half
   shader_variants:
-    - NAME: sdpa_compute_out_coop_texture3d_texture3d
-    - NAME: sdpa_compute_out_coop_buffer_texture3d
-      IO_STORAGE: buffer
+    - NAME: sdpa_compute_out_coop
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_compute_out_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/sdpa_compute_out_tiled.glsl
@@ -75,6 +75,7 @@ void main() {
   const int Q_H = q_projected_sizes.y;
   // sequence length
   const int S = q_projected_sizes.z;
+  const int S_aligned = align_up_4(S);
 
   // number of K/V heads
   const int KV_H = v_cache_sizes.y;
@@ -113,7 +114,7 @@ void main() {
       s,
       q_h,
       context_texel_len,
-      S,
+      S_aligned,
       Q_H);
 
     load_v_cache_tile_no_checks(
@@ -136,7 +137,7 @@ void main() {
       s,
       q_h,
       context_texel_len,
-      S,
+      S_aligned,
       Q_H);
 
     load_v_cache_tile_with_checks(
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_compute_out_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/sdpa_compute_out_tiled.yaml
@@ -13,10 +13,14 @@ sdpa_compute_out_tiled:
     TILE_K4: 1
     TILE_N4: 1
   generate_variant_forall:
+    combination:
+      parameter_names: [IO_STORAGE, V_CACHE_STORAGE]
+      combos:
+        - parameter_values: [texture3d, texture3d]
+        - parameter_values: [buffer, texture3d]
+        - parameter_values: [buffer, buffer]
     DTYPE:
       - VALUE: float
       - VALUE: half
   shader_variants:
-    - NAME: sdpa_compute_out_tiled_texture3d_texture3d
-    - NAME: sdpa_compute_out_tiled_buffer_texture3d
-      IO_STORAGE: buffer
+    - NAME: sdpa_compute_out_tiled
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_kv_cache_update.glsl b/backends/vulkan/runtime/graph/ops/glsl/sdpa_kv_cache_update.glsl
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_kv_cache_update.yaml b/backends/vulkan/runtime/graph/ops/glsl/sdpa_kv_cache_update.yaml
diff --git a/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp b/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp
diff --git a/backends/vulkan/test/op_tests/sdpa_test.cpp b/backends/vulkan/test/op_tests/sdpa_test.cpp
diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh

Original file line number	Diff line number	Diff line change
`@@ -687,7 +687,7 @@ def register_dequantize_for_conv2d_op():`
`687`	`687`	`@update_features("llama::sdpa_with_kv_cache")`
`688`	`688`	`def register_sdpa_with_kv_cache_op():`
`689`	`689`	`return OpFeatures(`
`690`		`- inputs_storage=utils.WIDTH_PACKED_TEXTURE,`
	`690`	`+ inputs_storage=utils.CONTIGUOUS_ANY,`
`691`	`691`	`supports_resize=True,`
`692`	`692`	`supports_prepacking=True,`
`693`	`693`	`)`