Skip to content

Commit d576add

Browse files
GregoryComer authored and facebook-github-bot committed
Modify depthwise int8 conv2d to reduce register/memory pressure (#16054)
Summary: Modify the Vulkan int8 depthwise convolution shader to reduce register pressure by not loading the full input window upfront. On Mali, it seems to spill to main memory with significant performance impact. This is a relatively naive implementation that largely just loads as needed. It shows significant speedup on Mali-G720, though it regresses Adreno performance by 10-20%. There is likely a lot of room for additional optimization here. In particular, optimizing cycle count by reducing bounds checks, looking at improvements to read coalescing, and using spec constants for the major params. Differential Revision: D88183224
1 parent 5daa6e3 commit d576add

File tree

5 files changed

+159
-160
lines changed

5 files changed

+159
-160
lines changed

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8_utils.glslh

Lines changed: 42 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -11,20 +11,6 @@
1111

1212
#extension GL_EXT_control_flow_attributes : require
1313

14-
struct InputWindow1D {
15-
vec4[MAX_WINDOW_WIDTH] data;
16-
int len;
17-
};
18-
19-
InputWindow1D initial_input_window() {
20-
InputWindow1D input_window;
21-
for (int i = 0; i < MAX_WINDOW_WIDTH; ++i) {
22-
input_window.data[i] = vec4(0);
23-
}
24-
input_window.len = 0;
25-
return input_window;
26-
}
27-
2814
vec4 dequantize(const int packed_texel, const float scale, const int zp) {
2915
return vec4(unpack_int8x4(packed_texel) - zp) * scale;
3016
}
@@ -49,109 +35,10 @@ bool in_bounds(
4935
return true;
5036
}
5137

52-
InputWindow1D load_input_window(
53-
const int w_start,
54-
const int w_end,
55-
const int h,
56-
const int c4,
57-
const Conv2dBlockExtents block_extents,
58-
const float input_scale,
59-
const int input_zp,
60-
const ivec4 input_zps) {
61-
InputWindow1D input_window = initial_input_window();
62-
63-
const int block_w_start = div_4(w_start);
64-
const int block_w_end = div_4(w_end);
65-
66-
int window_i = 0;
67-
for (int block_w = block_w_start; block_w <= block_w_end; ++block_w) {
68-
ivec4 input_block = input_zps;
69-
70-
if (in_bounds(block_w, h, c4, block_extents)) {
71-
#ifdef PACKED_INT8_INPUT_BUFFER
72-
const int buffer_idx =
73-
h * block_extents.data_xz + block_w * block_extents.data.z + c4;
74-
input_block = t_packed_int8_input[buffer_idx];
75-
#else
76-
input_block = texelFetch(t_packed_int8_input, ivec3(block_w, h, c4), 0);
77-
#endif
78-
}
79-
80-
const int loaded_w_start = mul_4(block_w);
81-
for (int row = 0; row < 4; ++row) {
82-
if (loaded_w_start + row >= w_start && loaded_w_start + row <= w_end) {
83-
input_window.data[window_i++] =
84-
dequantize(input_block[row], input_scale, input_zp);
85-
}
86-
}
87-
}
88-
input_window.len = window_i;
89-
return input_window;
90-
}
91-
92-
struct WeightRow {
93-
vec4[MAX_KERNEL_WIDTH] data;
94-
int len;
95-
};
96-
97-
WeightRow initial_weight_row() {
98-
WeightRow weight_row;
99-
for (int i = 0; i < MAX_KERNEL_WIDTH; ++i) {
100-
weight_row.data[i] = vec4(0);
101-
}
102-
weight_row.len = 0;
103-
return weight_row;
104-
}
105-
106-
WeightRow load_weight_row(
107-
const int oc4,
108-
const int ky,
109-
const int OC4,
110-
const int Kw,
111-
const int Kw4,
112-
const vec4 weight_scales) {
113-
WeightRow weight_row = initial_weight_row();
114-
115-
int k4 = ky * Kw4;
116-
int row_idx = 0;
117-
for (int w = 0; w < Kw; w += 4) {
118-
#ifdef WEIGHT_BUFFER
119-
const ivec4 weight_block = t_packed_int8_weight[k4 * OC4 + oc4];
120-
#else
121-
const ivec4 weight_block = texelFetch(
122-
t_packed_int8_weight, ivec2(oc4, k4), 0);
123-
#endif
124-
125-
for (int row = 0; row < 4; ++row) {
126-
if (w + row < Kw) {
127-
weight_row.data[row_idx++] = dequantize(weight_block[row], weight_scales);
128-
}
129-
}
130-
k4++;
131-
}
132-
weight_row.len = row_idx;
133-
return weight_row;
134-
}
135-
13638
struct FPOutBlock {
13739
vec4[4] data;
13840
};
13941

140-
void perform_conv1d(
141-
inout FPOutBlock out_block,
142-
const InputWindow1D input_window,
143-
const WeightRow weight_row) {
144-
for (int out_w = 0; out_w < 4; ++out_w) {
145-
[[unroll]] for (int kx = 0; kx < weight_row.len; ++kx) {
146-
const int in_w = out_w * conv2d_params.stride.x;
147-
out_block.data[out_w] = fma(
148-
input_window.data[in_w + kx],
149-
weight_row.data[kx],
150-
out_block.data[out_w]);
151-
}
152-
}
153-
}
154-
15542
ivec4 quantize(
15643
const vec4 texel, const float inv_scale, const int zp) {
15744
vec4 quantized = round(texel * inv_scale) + zp;
@@ -168,6 +55,48 @@ ivec4 quantize_and_pack(
16855
return packed_block;
16956
}
17057

58+
// Load a 4xint8 block of weights - channel c through c+3 (c = oc4*4).
59+
// Equivalent to unpacked_weights[ky][kx][c:c+4].
60+
int load_weight(
61+
int kx, // w coordinate to load
62+
int ky, // h coordinate to load
63+
int oc4, // channel block to load
64+
int kw4, // kernel width / 4 (rounded up)
65+
int OC4 // out channels / 4 (rounded up)
66+
) {
67+
68+
// Find the packed block index.
69+
int kx4 = kx / 4; // W block
70+
// Index into the packed weights for a 4W4C block.
71+
int linear_idx = ((ky * kw4 + kx4) * OC4 + oc4) * 4;
72+
int block_x_offset = kx % 4;
73+
return t_packed_int8_weight[linear_idx + block_x_offset];
74+
}
75+
76+
// Load a 4xint8 block of inputs - channel c through c+3 (c = oc4*4) at
77+
// the given spatial location.
78+
int load_input(
79+
int x, // w coordinate
80+
int y, // h coordinate
81+
int oc4, // channel block
82+
int OC4, // out channels / 4 (rounded up)
83+
Conv2dBlockExtents block_extents
84+
) {
85+
int block_w = x / 4;
86+
87+
if (in_bounds(block_w, y, oc4, block_extents) && x >= 0) {
88+
#ifdef PACKED_INT8_INPUT_BUFFER
89+
const int buffer_idx =
90+
(y * block_extents.data_xz + block_w * block_extents.data.z + oc4) * 4 + (x % 4);
91+
return t_packed_int8_input[buffer_idx];
92+
#else
93+
#error Unimplemented
94+
#endif
95+
} else {
96+
return pack_into_int32(ivec4(input_zp));
97+
}
98+
}
99+
171100
#ifdef DEBUG_MODE
172101

173102
void printInputWindow1D(const InputWindow1D input_window) {

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8ta_q8csw_q8to.glsl

Lines changed: 105 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,9 @@ layout(std430) buffer;
2727

2828
#include "conv2d_common.glslh"
2929

30-
${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", IO_STORAGE, is_scalar_array=False)}
31-
${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", IO_STORAGE, is_scalar_array=False)}
32-
${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)}
30+
${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", IO_STORAGE, is_scalar_array=True)}
31+
${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", IO_STORAGE, is_scalar_array=True)}
32+
${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, is_scalar_array=True)}
3333
${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)}
3434
${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)}
3535
${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)}
@@ -62,60 +62,126 @@ void main() {
6262
return;
6363
}
6464

65+
const int out_h = out_block_idx.data.y;
6566
const int out_w = mul_4(out_block_idx.data.x);
66-
const int w_start =
67-
(out_w * conv2d_params.stride.x) - conv2d_params.padding.x;
68-
const int w_end = ((out_w + 3) * conv2d_params.stride.x) -
69-
conv2d_params.padding.x +
70-
(conv2d_params.kernel_size.x - 1) * conv2d_params.dilation.x;
7167

7268
Conv2dBlockExtents in_block_extents = make_block_extents(input_sizes);
7369

74-
const ivec4 input_zps = ivec4(pack_into_int32(ivec4(input_zp)));
75-
const vec4 weight_scales = vec4(t_weight_scales[out_block_idx.data.z]);
76-
7770
const int Kw4 = div_up_4(conv2d_params.kernel_size.x);
7871

79-
FPOutBlock out_block;
72+
// Compute 4 channels for 4 output elements.
73+
ivec4 acc0 = ivec4(0);
74+
ivec4 acc1 = ivec4(0);
75+
ivec4 acc2 = ivec4(0);
76+
ivec4 acc3 = ivec4(0);
77+
8078
for (int ky = 0; ky < conv2d_params.kernel_size.y; ky++) {
81-
const int out_h = out_block_idx.data.y;
8279
const int h = out_h * conv2d_params.stride.y - conv2d_params.padding.y +
8380
ky * conv2d_params.dilation.y;
8481

85-
InputWindow1D input_window = load_input_window(
86-
w_start,
87-
w_end,
88-
h,
89-
out_block_idx.data.z,
90-
in_block_extents,
91-
input_scale,
92-
input_zp,
93-
input_zps);
94-
95-
WeightRow weight_row = load_weight_row(
96-
out_block_idx.data.z,
97-
ky,
98-
out_block_extents.data.z,
99-
conv2d_params.kernel_size.x,
100-
Kw4,
101-
weight_scales);
102-
103-
perform_conv1d(out_block, input_window, weight_row);
82+
for (int kx = 0; kx < conv2d_params.kernel_size.x; kx++) {
83+
const int w = out_w * conv2d_params.stride.x - conv2d_params.padding.x +
84+
kx * conv2d_params.dilation.x;
85+
86+
// Load and unpack weights.
87+
const int packed_weight_4c = load_weight(
88+
kx,
89+
ky,
90+
out_block_idx.data.z,
91+
Kw4,
92+
out_block_extents.data.z
93+
);
94+
95+
const ivec4 weight_4c = unpack_int8x4(packed_weight_4c);
96+
97+
// Load and unpack inputs.
98+
int packed_input0 = load_input(
99+
w,
100+
h,
101+
out_block_idx.data.z,
102+
out_block_extents.data.z,
103+
in_block_extents);
104+
105+
// Compute weight * input for all 4 accumulators.
106+
ivec4 input0 = unpack_int8x4(packed_input0);
107+
acc0 += weight_4c * input0;
108+
109+
int packed_input1 = load_input(
110+
w + conv2d_params.stride.x,
111+
h,
112+
out_block_idx.data.z,
113+
out_block_extents.data.z,
114+
in_block_extents);
115+
116+
ivec4 input1 = unpack_int8x4(packed_input1);
117+
acc1 += weight_4c * input1;
118+
119+
int packed_input2 = load_input(
120+
w + conv2d_params.stride.x * 2,
121+
h,
122+
out_block_idx.data.z,
123+
out_block_extents.data.z,
124+
in_block_extents);
125+
126+
ivec4 input2 = unpack_int8x4(packed_input2);
127+
acc2 += weight_4c * input2;
128+
129+
int packed_input3 = load_input(
130+
w + conv2d_params.stride.x * 3,
131+
h,
132+
out_block_idx.data.z,
133+
out_block_extents.data.z,
134+
in_block_extents);
135+
136+
ivec4 input3 = unpack_int8x4(packed_input3);
137+
acc3 += weight_4c * input3;
138+
}
104139
}
105140

141+
// Apply input zero point as weight_sum * input_zp.
142+
vec4 weight_sums = vec4(t_weight_sums[out_block_idx.data.z]);
143+
const vec4 weight_scales = vec4(t_weight_scales[out_block_idx.data.z]);
144+
145+
vec4 facc0 = vec4(acc0);
146+
facc0 -= weight_sums * input_zp;
147+
facc0 *= weight_scales * input_scale;
148+
149+
vec4 facc1 = vec4(acc1);
150+
facc1 -= weight_sums * input_zp;
151+
facc1 *= weight_scales * input_scale;
152+
153+
vec4 facc2 = vec4(acc2);
154+
facc2 -= weight_sums * input_zp;
155+
facc2 *= weight_scales * input_scale;
156+
157+
vec4 facc3 = vec4(acc3);
158+
facc3 -= weight_sums * input_zp;
159+
facc3 *= weight_scales * input_scale;
160+
106161
if (apply_bias > 0) {
107162
const vec4 bias = vec4(t_bias[out_block_idx.data.z]);
108-
for (int row = 0; row < 4; row++) {
109-
out_block.data[row] += bias;
110-
}
163+
facc0 += bias;
164+
facc1 += bias;
165+
facc2 += bias;
166+
facc3 += bias;
111167
}
112168

113-
const ivec4 packed_out_block = quantize_and_pack(
114-
out_block, output_inv_scale, output_zp);
169+
const ivec4 quantized_out0 = clamp(ivec4(round(facc0 * output_inv_scale) + output_zp), -128, 127);
170+
const ivec4 quantized_out1 = clamp(ivec4(round(facc1 * output_inv_scale) + output_zp), -128, 127);
171+
const ivec4 quantized_out2 = clamp(ivec4(round(facc2 * output_inv_scale) + output_zp), -128, 127);
172+
const ivec4 quantized_out3 = clamp(ivec4(round(facc3 * output_inv_scale) + output_zp), -128, 127);
173+
174+
const int packed_out_subblock0 = pack_into_int32(quantized_out0);
175+
const int packed_out_subblock1 = pack_into_int32(quantized_out1);
176+
const int packed_out_subblock2 = pack_into_int32(quantized_out2);
177+
const int packed_out_subblock3 = pack_into_int32(quantized_out3);
115178

116179
#ifdef PACKED_INT8_OUTPUT_BUFFER
117-
t_packed_int8_output[tid] = packed_out_block;
180+
t_packed_int8_output[tid * 4] = packed_out_subblock0;
181+
t_packed_int8_output[tid * 4 + 1] = packed_out_subblock1;
182+
t_packed_int8_output[tid * 4 + 2] = packed_out_subblock2;
183+
t_packed_int8_output[tid * 4 + 3] = packed_out_subblock3;
118184
#else
119-
imageStore(t_packed_int8_output, out_block_idx.data, packed_out_block);
185+
#error Unimplemented
120186
#endif
121187
}

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8ta_q8csw_q8to.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ conv2d_dw_q8ta_q8csw_q8to:
1313
combination:
1414
parameter_names: [IO_STORAGE, WEIGHT_STORAGE]
1515
combos:
16-
- parameter_values: [buffer, texture2d]
16+
- parameter_values: [buffer, buffer]
1717
DTYPE:
1818
- VALUE: float
1919
shader_variants:

backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -527,7 +527,7 @@ ValueRef prepack_quantized_conv2d_dw_weight(
527527

528528
std::vector<int64_t> packed_weight_sizes{output_height, output_width};
529529

530-
utils::StorageType storage_type = utils::kTexture2D;
530+
utils::StorageType storage_type = utils::kBuffer;
531531
uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim();
532532
if (output_width > max_extent * 4 || output_height > max_extent) {
533533
storage_type = utils::kBuffer;

0 commit comments

Comments (0)