
Commit e05b32f

Merge branch 'main' into migrate-pt2e-arm
2 parents: a2a4286 + 1a27c14

39 files changed: +2187, -2394 lines

backends/qualcomm/runtime/SharedBuffer.cpp

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ std::size_t std::hash<CustomMemTensorInfo>::operator()(
   hash_val ^= std::hash<size_t>()(info.pos);
   hash_val ^= std::hash<size_t>()(info.tensor_bytes);
   for (int i = 0; i < info.rank; ++i) {
-    hash_val ^= info.shape[i];
+    hash_val ^= std::hash<uint32_t>()(info.shape[i]);
   }
   hash_val ^= std::hash<uint32_t>()(info.rank);
   hash_val ^= std::hash<executorch::aten::ScalarType>()(info.dtype);

backends/qualcomm/runtime/backends/QnnBackendFactory.cpp

Lines changed: 3 additions & 1 deletion
@@ -80,7 +80,9 @@ std::unique_ptr<BackendConfigParameters> QnnBackendFactory::Create(
           options->soc_info(),
           htp_options);
       backend_params->qnn_mem_manager_ptr_ = std::make_unique<QnnMemManager>(
-          implementation, backend_params->qnn_context_ptr_.get());
+          implementation,
+          backend_params->qnn_context_ptr_.get(),
+          options->log_level());
       backend_params->backend_init_state_ = BackendInitializeState::INITIALIZED;
     } break;
     case QnnExecuTorchBackendType::kGpuBackend:

backends/qualcomm/runtime/backends/QnnMemManager.cpp

Lines changed: 11 additions & 6 deletions
@@ -47,9 +47,12 @@ Error QnnMemManager::RegisterIonMem(
   }
   tensor_wrapper->SetMemHandle(handle);
   registered_map_.insert({handle, mem_ptr});
-  QNN_EXECUTORCH_LOG_INFO(
-      "Tensor %s is successfully registered to ION shared memory.",
-      tensor_wrapper->GetName().c_str());
+  if (log_level_ >= QnnExecuTorchLogLevel::kLogLevelInfo) {
+    QNN_EXECUTORCH_LOG_INFO(
+        "Tensor %s is successfully registered to ION shared memory.",
+        tensor_wrapper->GetName().c_str());
+  }
+
   return Error::Ok;
 }

@@ -92,9 +95,11 @@ Error QnnMemManager::RegisterCustomMem(
   }
   tensor_wrapper->SetMemHandle(handle);
   registered_map_.insert({handle, mem_ptr});
-  QNN_EXECUTORCH_LOG_INFO(
-      "Tensor %s is successfully registered to custom shared memory.",
-      tensor_wrapper->GetName().c_str());
+  if (log_level_ >= QnnExecuTorchLogLevel::kLogLevelInfo) {
+    QNN_EXECUTORCH_LOG_INFO(
+        "Tensor %s is successfully registered to custom shared memory.",
+        tensor_wrapper->GetName().c_str());
+  }
   return Error::Ok;
 }

backends/qualcomm/runtime/backends/QnnMemManager.h

Lines changed: 6 additions & 2 deletions
@@ -21,8 +21,11 @@ class QnnMemManager {
  public:
   explicit QnnMemManager(
       const QnnImplementation& implementation,
-      QnnContext* context)
-      : implementation_(implementation), context_(context) {}
+      QnnContext* context,
+      QnnExecuTorchLogLevel log_level)
+      : implementation_(implementation),
+        context_(context),
+        log_level_(log_level) {}
   ~QnnMemManager() {
     DeRegisterMem();
   }
@@ -63,6 +66,7 @@ class QnnMemManager {

   const QnnImplementation& implementation_;
   QnnContext* context_;
+  QnnExecuTorchLogLevel log_level_;
   std::unordered_map<Qnn_MemHandle_t, void*> registered_map_;
   std::unordered_map<CustomMemTensorInfo, void*> pre_registered_handles_;
   std::unordered_map<executorch::aten::ScalarType, Qnn_DataType_t>

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 1 addition & 1 deletion
@@ -3681,7 +3681,7 @@ def test_llama3_2_1b(self):
         if self.pre_gen_pte:
             cmds.extend(["--pre_gen_pte", self.pre_gen_pte])

-        golden_start_with = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>"
+        golden_start_with = "<|start_header_id|>user<|end_header_id|>"
         p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
         with Listener((self.ip, self.port)) as listener:
             conn = listener.accept()

backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl

Lines changed: 36 additions & 14 deletions
@@ -88,10 +88,18 @@ void main() {
     ipos[i] = pos[i] * stride - padding;
   }

-  vec4 sum[TILE_SIZE_X * TILE_SIZE_Y];
-  sum[0] = texelFetch(t_bias, ivec2(gpos.z, 0), 0);
-  for (int i = 1; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
-    sum[i] = sum[0];
+  // Final output array where each element is a tensor value.
+  // Tuple of consecutive 4 elements represents a single output texel.
+  float sum[TILE_SIZE_X * TILE_SIZE_Y * 4];
+
+  const vec4 bias = texelFetch(t_bias, ivec2(gpos.z, 0), 0);
+
+  // Initialize the output array with the bias value
+  for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y * 4; i += 4) {
+    sum[i] = bias.x;
+    sum[i + 1] = bias.y;
+    sum[i + 2] = bias.z;
+    sum[i + 3] = bias.w;
   }

   int z4 = 0;
@@ -100,14 +108,26 @@
     // During prepacking, the weight tensor has been permuted so that the
     // channel (IC) dim is along the x-axis, and the batch (OC) dim is along
    // the z-axis.
-    const vec4 ktex_0 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(0, 0));
-    const vec4 ktex_1 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(1, 0));
-    const vec4 ktex_2 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(2, 0));
-    const vec4 ktex_3 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(3, 0));
+    float kernel_values[4 * 4]; // 4 channels, 4 elements per channel
+
+    // Load kernel values from texels to array
+    for (int i = 0; i < 4; ++i) {
+      const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, gpos.z), 0);
+      kernel_values[i * 4 + 0] = k_tex.x;
+      kernel_values[i * 4 + 1] = k_tex.y;
+      kernel_values[i * 4 + 2] = k_tex.z;
+      kernel_values[i * 4 + 3] = k_tex.w;
+    }

-    #pragma unroll
     for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
       const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i], z4), 0);
+      // Load the input texel into an array
+      float tex_values[4];
+      tex_values[0] = in_tex.x;
+      tex_values[1] = in_tex.y;
+      tex_values[2] = in_tex.z;
+      tex_values[3] = in_tex.w;
+
       // For 2x2 tile size algorithm works as follows.
       // To explain the calculations below, the contents of one in_tex and the
       // group of 4 texels loaded from t_kernel are shown:
@@ -141,18 +161,20 @@
       //
       // which is what is expressed in the following calculations. This is done
       // for each output position.
-      sum[i] = fma(in_tex.xxxx, ktex_0, sum[i]);
-      sum[i] = fma(in_tex.yyyy, ktex_1, sum[i]);
-      sum[i] = fma(in_tex.zzzz, ktex_2, sum[i]);
-      sum[i] = fma(in_tex.wwww, ktex_3, sum[i]);
+      for (int j = 0; j < 4; ++j) {
+        sum[i * 4 + j] = tex_values[0] * kernel_values[0 + j] + sum[i * 4 + j];
+        sum[i * 4 + j] = tex_values[1] * kernel_values[4 + j] + sum[i * 4 + j];
+        sum[i * 4 + j] = tex_values[2] * kernel_values[8 + j] + sum[i * 4 + j];
+        sum[i * 4 + j] = tex_values[3] * kernel_values[12 + j] + sum[i * 4 + j];
+      }
     }
   }

   for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
     const uint index = (shared_mem_stride * i) + gl_LocalInvocationIndex;
     const ivec3 pos = pos_shared[offset_pos_index(index)];
     if (all(lessThan(pos, out_limits.xyz))) {
-      imageStore(t_out, pos, op(sum[i], out_min, out_max));
+      imageStore(t_out, pos, op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max));
     }
   }
 }
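For reference, the accumulation the rewritten shader performs for a single output texel can be sketched in NumPy. This is only an illustration of the math in the hunk above; the array shapes and the helper name are hypothetical and not part of the runtime. Each z4 channel group contributes one 4-channel input texel and a 4x4 kernel block, and every output channel j accumulates tex_values[c] * kernel_values[c * 4 + j] on top of the bias.

# Illustrative NumPy sketch (hypothetical shapes/helper, not runtime code) of the
# per-output-texel accumulation expressed by the shader's inner loops above.
import numpy as np

def pointwise_conv_texel(in_texels, kernel_blocks, bias):
    # in_texels:     (G, 4)    -- one 4-channel input texel per z4 channel group
    # kernel_blocks: (G, 4, 4) -- per group, indexed [input_channel][output_channel]
    # bias:          (4,)      -- one bias value per output channel of the texel
    out = bias.copy()
    for g in range(in_texels.shape[0]):   # outer loop over channel groups (z4)
        for c in range(4):                # tex_values[c]
            for j in range(4):            # sum[i * 4 + j]
                out[j] += in_texels[g, c] * kernel_blocks[g, c, j]
    return out

rng = np.random.default_rng(0)
print(pointwise_conv_texel(rng.standard_normal((2, 4)),
                           rng.standard_normal((2, 4, 4)),
                           rng.standard_normal(4)))  # 4 output-channel values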

backends/xnnpack/operators/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@
     op_dynamic_quantize_ops,
     op_elu,
     op_floor,
+    op_gelu,
     op_hardswish,
     op_hardtanh,
     op_leaky_relu,
Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict
+
+import torch
+from executorch.backends.xnnpack.operators.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import (
+    XNNGelu,
+    XNNGraph,
+    XNode,
+)
+from executorch.backends.xnnpack.utils.utils import get_input_node
+
+
+@register_node_visitor
+class GeluVisitor(NodeVisitor):
+    target = "aten.gelu.default"
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        xnn_graph: XNNGraph,
+        vals_to_ids: Dict[torch.fx.Node, int],
+        debug_handle: int,
+    ) -> None:
+        self.define_nodes_tensor_inputs_outputs(node, xnn_graph, vals_to_ids)
+
+        # input
+        input_id = vals_to_ids[get_input_node(node, 0)]
+
+        # output
+        output_id = vals_to_ids[node]
+
+        ser_node = XNode(
+            xnode_union=XNNGelu(
+                input_id=input_id,
+                output_id=output_id,
+                flags=0,
+            ),
+            debug_handle=debug_handle,
+        )
+        xnn_graph.xnodes.append(ser_node)
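The hunk above appears to add a new node visitor (its file path is not shown in this view) that serializes aten.gelu.default into an XNNGelu node. As a rough sketch of how such a node would reach this visitor, assuming the usual export-and-lower flow with XnnpackPartitioner (the module and tensor shapes below are made up for illustration):

# Hedged sketch: exporting a GELU module and lowering it through the XNNPACK
# partitioner, which is the path that would exercise the visitor above.
import torch
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.exir import to_edge_transform_and_lower

class GeluModule(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.gelu(x)

exported = torch.export.export(GeluModule(), (torch.randn(1, 16),))
edge = to_edge_transform_and_lower(exported, partitioner=[XnnpackPartitioner()])
executorch_program = edge.to_executorch()  # gelu should now be delegated to XNNPACK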

backends/xnnpack/partition/config/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -26,6 +26,7 @@
     DeQuantizedPerTensorConfig,
     DivConfig,
     FloorConfig,
+    GeluConfig,
     HardswishConfig,
     # EluConfig,
     HardtanhConfig,
@@ -79,6 +80,7 @@
     DivConfig,
     # EluConfig, # Waiting for PyTorch Pin Update
     FloorConfig,
+    GeluConfig,
     HardtanhConfig,
     HardswishConfig,
     LeakyReLUConfig,

backends/xnnpack/partition/config/generic_node_configs.py

Lines changed: 7 additions & 0 deletions
@@ -343,6 +343,13 @@ def supported_precision_types(self) -> List[ConfigPrecisionType]:
         return [ConfigPrecisionType.FP32]


+class GeluConfig(GenericNodePartitionerConfig):
+    target_name = "gelu.default"
+
+    def supported_precision_types(self) -> List[ConfigPrecisionType]:
+        return [ConfigPrecisionType.FP32]
+
+
 class HardswishConfig(GenericNodePartitionerConfig):
     target_name = "hardswish.default"
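GeluConfig keys off the gelu.default target, so a quick way to see what the partitioner will match is to inspect an exported graph. The snippet below is illustrative only and assumes torch.export keeps aten.gelu.default undecomposed, which is typical of current export behavior:

# Illustrative check (assumption: export keeps gelu undecomposed) that an exported
# graph contains the aten.gelu.default target GeluConfig is keyed on.
import torch

class M(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.gelu(x)

graph = torch.export.export(M(), (torch.randn(4),)).graph
print(any(node.target == torch.ops.aten.gelu.default for node in graph.nodes))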
