pytorch
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/common.glslh‎
Lines changed: 24 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/common.glslh‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_common.glslh‎
Lines changed: 42 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_common.glslh‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_activation_block.glslh‎
Lines changed: 37 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_activation_block.glslh‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_block_load.glslh‎
Lines changed: 30 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_block_load.glslh‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_output_block_store.glslh‎
Lines changed: 32 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_output_block_store.glslh‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_activation_block.glslh‎
Lines changed: 86 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_activation_block.glslh‎
Lines changed: 86 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_input_block_store.glslh‎
Lines changed: 30 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_input_block_store.glslh‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_output_block_load.glslh‎
Lines changed: 29 additions & 0 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_output_block_load.glslh‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/linear_common.glslh‎
Lines changed: 0 additions & 13 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/linear_common.glslh‎
Lines changed: 0 additions & 13 deletions
@@ -33,6 +33,30 @@ struct TensorIndex4D {
   ivec4 data;
 };
 
+// Sign-extends an 8-bit value held in the low byte of `val`. When bit 7 is
+// set, every bit above the low byte is set as well; otherwise `val` is
+// returned unchanged.
+int sign_extend_8bit(const int val) {
+  const bool sign_bit_set = (val & 0x80) != 0;
+  return sign_bit_set ? (val | (~0xFF)) : val;
+}
+
+// Returns byte `i` of `packed` as a sign-extended int. Byte 0 is the least
+// significant byte (little endian order).
+int extract_8bit_from_packed_int_le(const int packed, const int i) {
+  const int raw_byte = (packed >> (8 * i)) & 0xFF;
+  return sign_extend_8bit(raw_byte);
+}
+
+// Packs the low byte of each argument into a single int. `val0` lands in the
+// least significant byte and `val3` in the most significant byte.
+int pack_4xqint_into_int32(
+    const int val0,
+    const int val1,
+    const int val2,
+    const int val3) {
+  int packed = val0 & 0xFF;
+  packed |= (val1 & 0xFF) << 8;
+  packed |= (val2 & 0xFF) << 16;
+  packed |= (val3 & 0xFF) << 24;
+  return packed;
+}
+
 #ifdef DEBUG_MODE
 
 #extension GL_EXT_debug_printf : require
 
@@ -27,6 +27,48 @@ struct Conv2DParams {
   int K4;
 };
 
+struct Conv2dTensorIndex { // Index of a single texel, as produced by block_idx_to_tensor_idx.
+  ivec3 data; // 3-component coordinate; the fp texel helpers pass it to texelFetch / imageStore.
+  int texel_i; // Set to 0 by block_idx_to_tensor_idx; presumably a component index within the texel - TODO confirm.
+};
+
+struct Conv2dBlockIndex { // Index of one block; block_idx_to_tensor_idx maps it to the block's first texel.
+  ivec3 data; // x is mapped through mul_4 to obtain the texel x; y and z are carried over unchanged.
+};
+
+// Converts a block index into the tensor index of the block's first texel.
+// The x component is mapped through mul_4, y and z carry over unchanged, and
+// texel_i starts at 0.
+Conv2dTensorIndex block_idx_to_tensor_idx(const Conv2dBlockIndex block_idx) {
+  Conv2dTensorIndex first_texel;
+  first_texel.data =
+      ivec3(mul_4(block_idx.data.x), block_idx.data.y, block_idx.data.z);
+  first_texel.texel_i = 0;
+  return first_texel;
+}
+
+struct Conv2dBlockExtents { // Per-dimension block counts, as filled in by make_block_extents.
+  ivec3 data; // (div_up_4(sizes.x), sizes.y, div_up_4(sizes.z)) of the source tensor sizes.
+  int data_xz; // Cached product data.x * data.z.
+};
+
+// Derives block extents from tensor sizes. The x and z sizes go through
+// div_up_4, the y size is kept as-is, and the x * z product is precomputed
+// into data_xz.
+Conv2dBlockExtents make_block_extents(const ivec4 tensor_sizes) {
+  const int blocks_x = div_up_4(tensor_sizes.x);
+  const int blocks_z = div_up_4(tensor_sizes.z);
+  Conv2dBlockExtents extents;
+  extents.data = ivec3(blocks_x, tensor_sizes.y, blocks_z);
+  extents.data_xz = blocks_x * blocks_z;
+  return extents;
+}
+
+// Returns true when any component of `block_idx` is greater than or equal to
+// the matching component of `block_extents`.
+bool block_idx_out_of_bounds(
+    const Conv2dBlockIndex block_idx,
+    const Conv2dBlockExtents block_extents) {
+  return any(greaterThanEqual(block_idx.data, block_extents.data));
+}
+
 #ifdef DEBUG_MODE
 
 void printConv2DParams(const Conv2DParams params) {
 
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef CONV2D_FP_ACTIVATION_BLOCK
+#define CONV2D_FP_ACTIVATION_BLOCK
+
+#extension GL_EXT_control_flow_attributes : require
+
+struct FPActivationBlock { // Four VEC4_T values handled together as one activation block.
+  VEC4_T data[4]; // One VEC4_T per entry; the printer below labels entries by index 0..3.
+};
+
+#ifdef DEBUG_MODE
+
+#extension GL_EXT_debug_printf : require
+
+// Debug helper: prints a header line followed by the four components of each
+// of the block's four entries, one entry per line.
+void printFPActivationBlock(const FPActivationBlock activation_block) {
+  debugPrintfEXT("fp activation_block: \\n");
+  [[unroll]] for (int row = 0; row < 4; row++) {
+    const VEC4_T texel = activation_block.data[row];
+    debugPrintfEXT(
+        "  [%d]: %f, %f, %f, %f, \\n", row, texel.x, texel.y, texel.z, texel.w);
+  }
+}
+
+#endif // DEBUG_MODE
+
+#endif // CONV2D_FP_ACTIVATION_BLOCK
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef CONV2D_FP_ACTIVATION_BLOCK_LOAD
+#define CONV2D_FP_ACTIVATION_BLOCK_LOAD
+
+#extension GL_EXT_control_flow_attributes : require
+
+#include "conv2d_fp_activation_block.glslh"
+
+// Fetches the texel of t_fp_input located at `tidx.data` (level 0).
+VEC4_T load_fp_input_texel(const Conv2dTensorIndex tidx) {
+  const ivec3 texel_pos = tidx.data;
+  return texelFetch(t_fp_input, texel_pos, 0);
+}
+
+// Fills `block` with four texels read at consecutive x positions, starting at
+// the tensor index that block_idx_to_tensor_idx returns for `block_idx`.
+void load_fp_input_block(
+    out FPActivationBlock block,
+    const Conv2dBlockIndex block_idx) {
+  const Conv2dTensorIndex base_tidx = block_idx_to_tensor_idx(block_idx);
+  [[unroll]] for (int w = 0; w < 4; ++w) {
+    Conv2dTensorIndex texel_tidx = base_tidx;
+    texel_tidx.data.x += w;
+    block.data[w] = load_fp_input_texel(texel_tidx);
+  }
+}
+
+#endif // CONV2D_FP_ACTIVATION_BLOCK_LOAD
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef CONV2D_FP_ACTIVATION_BLOCK_STORE
+#define CONV2D_FP_ACTIVATION_BLOCK_STORE
+
+#extension GL_EXT_control_flow_attributes : require
+
+#include "conv2d_fp_activation_block.glslh"
+
+// Writes `out_texel` to t_fp_output at the position `tidx.data`.
+void store_fp_output_texel(
+    const Conv2dTensorIndex tidx,
+    const VEC4_T out_texel) {
+  const ivec3 texel_pos = tidx.data;
+  imageStore(t_fp_output, texel_pos, out_texel);
+}
+
+// Writes the four texels of `block` to consecutive x positions, starting at
+// the tensor index that block_idx_to_tensor_idx returns for `block_idx`.
+void store_fp_activation_block(
+    const FPActivationBlock block,
+    const Conv2dBlockIndex block_idx) {
+  const Conv2dTensorIndex base_tidx = block_idx_to_tensor_idx(block_idx);
+  [[unroll]] for (int w = 0; w < 4; ++w) {
+    Conv2dTensorIndex texel_tidx = base_tidx;
+    texel_tidx.data.x += w;
+    store_fp_output_texel(texel_tidx, block.data[w]);
+  }
+}
+
+#endif // CONV2D_FP_ACTIVATION_BLOCK_STORE
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef CONV2D_INT8_ACTIVATION_BLOCK
+#define CONV2D_INT8_ACTIVATION_BLOCK
+
+#extension GL_EXT_control_flow_attributes : require
+
+#include "common.glslh"
+#include "conv2d_fp_activation_block.glslh"
+
+struct Int8ActivationBlock { // Quantized form of an FPActivationBlock (see quantize_fp_activation_block).
+  ivec4 data; // Component w packs the four 8-bit values of FP entry w into one int, first value in the lowest byte.
+};
+
+// Quantizes a floating point texel to the signed 8-bit range:
+//   q = clamp(round(val * q_inv_scale) + q_zero_point, -128, 127)
+//
+// The clamp is applied in floating point, before the conversion to int.
+// Converting a float that does not fit in an int gives an undefined result,
+// so clamping after the conversion could not guarantee the [-128, 127] range.
+// For values that do fit, the result matches clamping after the conversion.
+ivec4 quantize_to_8bit(
+    const VEC4_T val,
+    const float q_inv_scale,
+    const int q_zero_point) {
+  const vec4 scaled = round(vec4(val) * q_inv_scale) + q_zero_point;
+  return ivec4(clamp(scaled, -128.0, 127.0));
+}
+
+// Quantizes each of the four entries of `in_block` with quantize_to_8bit and
+// packs the four resulting 8-bit values into one int of `out_block.data`,
+// with the x component in the lowest byte.
+void quantize_fp_activation_block(
+    out Int8ActivationBlock out_block,
+    const FPActivationBlock in_block,
+    const float q_inv_scale,
+    const int q_zero_point) {
+  [[unroll]] for (int w = 0; w < 4; ++w) {
+    const ivec4 q =
+        quantize_to_8bit(in_block.data[w], q_inv_scale, q_zero_point);
+    out_block.data[w] = pack_4xqint_into_int32(q.x, q.y, q.z, q.w);
+  }
+}
+
+// Maps quantized integers back to floating point:
+//   (val - q_zero_point) * q_scale
+VEC4_T
+dequantize_8bit(const ivec4 val, const float q_scale, const int q_zero_point) {
+  const ivec4 centered = val - q_zero_point;
+  return VEC4_T(centered) * q_scale;
+}
+
+// Unpacks each int of `in_block.data` into its four sign-extended bytes
+// (lowest byte first) and dequantizes them into the matching entry of
+// `out_block`.
+void dequantize_int8_activation_block(
+    out FPActivationBlock out_block,
+    const Int8ActivationBlock in_block,
+    const float q_scale,
+    const int q_zero_point) {
+  [[unroll]] for (int w = 0; w < 4; ++w) {
+    const int packed = in_block.data[w];
+    ivec4 unpacked;
+    [[unroll]] for (int b = 0; b < 4; ++b) {
+      unpacked[b] = extract_8bit_from_packed_int_le(packed, b);
+    }
+    out_block.data[w] = dequantize_8bit(unpacked, q_scale, q_zero_point);
+  }
+}
+
+#ifdef DEBUG_MODE
+
+// Debug helper: prints a header line, then for each packed int of the block
+// its four sign-extended bytes, lowest byte first.
+void printInt8ActivationBlock(const Int8ActivationBlock block) {
+  debugPrintfEXT("Int8ActivationBlock: \\n");
+  [[unroll]] for (int row = 0; row < 4; ++row) {
+    const int packed = block.data[row];
+    const ivec4 bytes = ivec4(
+        extract_8bit_from_packed_int_le(packed, 0),
+        extract_8bit_from_packed_int_le(packed, 1),
+        extract_8bit_from_packed_int_le(packed, 2),
+        extract_8bit_from_packed_int_le(packed, 3));
+    debugPrintfEXT(
+        "  [%d]: %d, %d, %d, %d, \\n", row, bytes.x, bytes.y, bytes.z, bytes.w);
+  }
+}
+
+#endif // DEBUG_MODE
+
+#endif // CONV2D_INT8_ACTIVATION_BLOCK
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef CONV2D_INT8_INPUT_BLOCK_STORE
+#define CONV2D_INT8_INPUT_BLOCK_STORE
+
+#extension GL_EXT_control_flow_attributes : require
+
+#include "conv2d_common.glslh"
+#include "conv2d_int8_activation_block.glslh"
+
+// Writes one packed int8 block to t_packed_int8_input.
+// With OUTPUT_BUFFER defined, the destination is indexed linearly as
+//   y * data_xz + x * data.z + z
+// otherwise the block index is used directly as the image coordinate.
+void store_packed_int8_input_block(
+    const Conv2dBlockIndex block_idx,
+    const Conv2dBlockExtents block_extents,
+    const Int8ActivationBlock packed_int8_block) {
+#ifdef OUTPUT_BUFFER
+  const int y_offset = block_idx.data.y * block_extents.data_xz;
+  const int x_offset = block_idx.data.x * block_extents.data.z;
+  const int buffer_idx = y_offset + x_offset + block_idx.data.z;
+  t_packed_int8_input[buffer_idx] = packed_int8_block.data;
+#else
+  imageStore(t_packed_int8_input, block_idx.data, packed_int8_block.data);
+#endif
+}
+
+#endif // CONV2D_INT8_INPUT_BLOCK_STORE
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// The guard is named after what this header provides (a loader), so it
+// cannot clash with the guard of a matching "store" header.
+#ifndef CONV2D_INT8_OUTPUT_BLOCK_LOAD
+#define CONV2D_INT8_OUTPUT_BLOCK_LOAD
+
+#extension GL_EXT_control_flow_attributes : require
+
+#include "conv2d_common.glslh"
+#include "conv2d_int8_activation_block.glslh"
+
+// Reads one packed int8 block from t_packed_int8_output.
+// With INPUT_BUFFER defined, the source is indexed linearly as
+//   y * data_xz + x * data.z + z
+// otherwise the block index is used directly as the texel coordinate
+// (level 0).
+ivec4 load_packed_int8_output_block(
+    const Conv2dBlockIndex block_idx,
+    const Conv2dBlockExtents block_extents) {
+#ifdef INPUT_BUFFER
+  const int y_offset = block_idx.data.y * block_extents.data_xz;
+  const int x_offset = block_idx.data.x * block_extents.data.z;
+  const int buffer_idx = y_offset + x_offset + block_idx.data.z;
+  return t_packed_int8_output[buffer_idx];
+#else
+  return texelFetch(t_packed_int8_output, block_idx.data, 0);
+#endif
+}
+
+#endif // CONV2D_INT8_OUTPUT_BLOCK_LOAD
@@ -16,19 +16,6 @@
 
 #include "common.glslh"
 
-int sign_extend_8bit(const int val) {
-  if ((val & 0x80) != 0) {
-    return val | (~0xFF);
-  }
-  return val;
-}
-
-int extract_8bit_from_packed_int_le(const int packed, const int i) {
-  // account for little endian
-  int byte = sign_extend_8bit(packed >> (8 * i) & 0xFF);
-  return byte;
-}
-
 // Extract a 4-bit value from a packed int (little endian)
 // It is assumed that the 4-bit value is in the range [0, 15]
 int extract_4bit_from_packed_int_le(const int packed, const int col) {