
Commit e05b32f

Merge branch 'main' into migrate-pt2e-arm
2 parents: a2a4286 + 1a27c14

39 files changed: +2187, -2394 lines

backends/qualcomm/runtime/SharedBuffer.cpp

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ std::size_t std::hash<CustomMemTensorInfo>::operator()(
   hash_val ^= std::hash<size_t>()(info.pos);
   hash_val ^= std::hash<size_t>()(info.tensor_bytes);
   for (int i = 0; i < info.rank; ++i) {
-    hash_val ^= info.shape[i];
+    hash_val ^= std::hash<uint32_t>()(info.shape[i]);
   }
   hash_val ^= std::hash<uint32_t>()(info.rank);
   hash_val ^= std::hash<executorch::aten::ScalarType>()(info.dtype);

backends/qualcomm/runtime/backends/QnnBackendFactory.cpp

Lines changed: 3 additions & 1 deletion
@@ -80,7 +80,9 @@ std::unique_ptr<BackendConfigParameters> QnnBackendFactory::Create(
           options->soc_info(),
           htp_options);
       backend_params->qnn_mem_manager_ptr_ = std::make_unique<QnnMemManager>(
-          implementation, backend_params->qnn_context_ptr_.get());
+          implementation,
+          backend_params->qnn_context_ptr_.get(),
+          options->log_level());
       backend_params->backend_init_state_ = BackendInitializeState::INITIALIZED;
     } break;
     case QnnExecuTorchBackendType::kGpuBackend:

backends/qualcomm/runtime/backends/QnnMemManager.cpp

Lines changed: 11 additions & 6 deletions
@@ -47,9 +47,12 @@ Error QnnMemManager::RegisterIonMem(
   }
   tensor_wrapper->SetMemHandle(handle);
   registered_map_.insert({handle, mem_ptr});
-  QNN_EXECUTORCH_LOG_INFO(
-      "Tensor %s is successfully registered to ION shared memory.",
-      tensor_wrapper->GetName().c_str());
+  if (log_level_ >= QnnExecuTorchLogLevel::kLogLevelInfo) {
+    QNN_EXECUTORCH_LOG_INFO(
+        "Tensor %s is successfully registered to ION shared memory.",
+        tensor_wrapper->GetName().c_str());
+  }
+
   return Error::Ok;
 }

@@ -92,9 +95,11 @@ Error QnnMemManager::RegisterCustomMem(
   }
   tensor_wrapper->SetMemHandle(handle);
   registered_map_.insert({handle, mem_ptr});
-  QNN_EXECUTORCH_LOG_INFO(
-      "Tensor %s is successfully registered to custom shared memory.",
-      tensor_wrapper->GetName().c_str());
+  if (log_level_ >= QnnExecuTorchLogLevel::kLogLevelInfo) {
+    QNN_EXECUTORCH_LOG_INFO(
+        "Tensor %s is successfully registered to custom shared memory.",
+        tensor_wrapper->GetName().c_str());
+  }
   return Error::Ok;
 }

backends/qualcomm/runtime/backends/QnnMemManager.h

Lines changed: 6 additions & 2 deletions
@@ -21,8 +21,11 @@ class QnnMemManager {
  public:
   explicit QnnMemManager(
       const QnnImplementation& implementation,
-      QnnContext* context)
-      : implementation_(implementation), context_(context) {}
+      QnnContext* context,
+      QnnExecuTorchLogLevel log_level)
+      : implementation_(implementation),
+        context_(context),
+        log_level_(log_level) {}
   ~QnnMemManager() {
     DeRegisterMem();
   }
@@ -63,6 +66,7 @@ class QnnMemManager {

   const QnnImplementation& implementation_;
   QnnContext* context_;
+  QnnExecuTorchLogLevel log_level_;
   std::unordered_map<Qnn_MemHandle_t, void*> registered_map_;
   std::unordered_map<CustomMemTensorInfo, void*> pre_registered_handles_;
   std::unordered_map<executorch::aten::ScalarType, Qnn_DataType_t>

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 1 addition & 1 deletion
@@ -3681,7 +3681,7 @@ def test_llama3_2_1b(self):
         if self.pre_gen_pte:
             cmds.extend(["--pre_gen_pte", self.pre_gen_pte])

-        golden_start_with = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>"
+        golden_start_with = "<|start_header_id|>user<|end_header_id|>"
         p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
         with Listener((self.ip, self.port)) as listener:
             conn = listener.accept()

backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl

Lines changed: 36 additions & 14 deletions
@@ -88,10 +88,18 @@ void main() {
     ipos[i] = pos[i] * stride - padding;
   }

-  vec4 sum[TILE_SIZE_X * TILE_SIZE_Y];
-  sum[0] = texelFetch(t_bias, ivec2(gpos.z, 0), 0);
-  for (int i = 1; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
-    sum[i] = sum[0];
+  // Final output array where each element is a tensor value.
+  // Tuple of consecutive 4 elements represents a single output texel.
+  float sum[TILE_SIZE_X * TILE_SIZE_Y * 4];
+
+  const vec4 bias = texelFetch(t_bias, ivec2(gpos.z, 0), 0);
+
+  // Initialize the output array with the bias value
+  for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y * 4; i += 4) {
+    sum[i] = bias.x;
+    sum[i + 1] = bias.y;
+    sum[i + 2] = bias.z;
+    sum[i + 3] = bias.w;
   }

   int z4 = 0;
@@ -100,14 +108,26 @@
     // During prepacking, the weight tensor has been permuted so that the
     // channel (IC) dim is along the x-axis, and the batch (OC) dim is along
    // the z-axis.
-    const vec4 ktex_0 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(0, 0));
-    const vec4 ktex_1 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(1, 0));
-    const vec4 ktex_2 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(2, 0));
-    const vec4 ktex_3 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(3, 0));
+    float kernel_values[4 * 4]; // 4 channels, 4 elements per channel
+
+    // Load kernel values from texels to array
+    for (int i = 0; i < 4; ++i) {
+      const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, gpos.z), 0);
+      kernel_values[i * 4 + 0] = k_tex.x;
+      kernel_values[i * 4 + 1] = k_tex.y;
+      kernel_values[i * 4 + 2] = k_tex.z;
+      kernel_values[i * 4 + 3] = k_tex.w;
+    }

-    #pragma unroll
     for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
       const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i], z4), 0);
+      // Load the input texel into an array
+      float tex_values[4];
+      tex_values[0] = in_tex.x;
+      tex_values[1] = in_tex.y;
+      tex_values[2] = in_tex.z;
+      tex_values[3] = in_tex.w;
+
       // For 2x2 tile size algorithm works as follows.
       // To explain the calculations below, the contents of one in_tex and the
       // group of 4 texels loaded from t_kernel are shown:
@@ -141,18 +161,20 @@
       //
       // which is what is expressed in the following calculations. This is done
       // for each output position.
-      sum[i] = fma(in_tex.xxxx, ktex_0, sum[i]);
-      sum[i] = fma(in_tex.yyyy, ktex_1, sum[i]);
-      sum[i] = fma(in_tex.zzzz, ktex_2, sum[i]);
-      sum[i] = fma(in_tex.wwww, ktex_3, sum[i]);
+      for (int j = 0; j < 4; ++j) {
+        sum[i * 4 + j] = tex_values[0] * kernel_values[0 + j] + sum[i * 4 + j];
+        sum[i * 4 + j] = tex_values[1] * kernel_values[4 + j] + sum[i * 4 + j];
+        sum[i * 4 + j] = tex_values[2] * kernel_values[8 + j] + sum[i * 4 + j];
+        sum[i * 4 + j] = tex_values[3] * kernel_values[12 + j] + sum[i * 4 + j];
+      }
     }
   }

   for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
     const uint index = (shared_mem_stride * i) + gl_LocalInvocationIndex;
     const ivec3 pos = pos_shared[offset_pos_index(index)];
     if (all(lessThan(pos, out_limits.xyz))) {
-      imageStore(t_out, pos, op(sum[i], out_min, out_max));
+      imageStore(t_out, pos, op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max));
     }
   }
 }
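For reference, the accumulation the rewritten shader performs for a single output texel can be sketched in NumPy. This is only an illustration of the math in the hunk above; the array shapes and the helper name are hypothetical and not part of the runtime. Each z4 channel group contributes one 4-channel input texel and a 4x4 kernel block, and every output channel j accumulates tex_values[c] * kernel_values[c * 4 + j] on top of the bias.

# Illustrative NumPy sketch (hypothetical shapes/helper, not runtime code) of the
# per-output-texel accumulation expressed by the shader's inner loops above.
import numpy as np

def pointwise_conv_texel(in_texels, kernel_blocks, bias):
    # in_texels:     (G, 4)    -- one 4-channel input texel per z4 channel group
    # kernel_blocks: (G, 4, 4) -- per group, indexed [input_channel][output_channel]
    # bias:          (4,)      -- one bias value per output channel of the texel
    out = bias.copy()
    for g in range(in_texels.shape[0]):   # outer loop over channel groups (z4)
        for c in range(4):                # tex_values[c]
            for j in range(4):            # sum[i * 4 + j]
                out[j] += in_texels[g, c] * kernel_blocks[g, c, j]
    return out

rng = np.random.default_rng(0)
print(pointwise_conv_texel(rng.standard_normal((2, 4)),
                           rng.standard_normal((2, 4, 4)),
                           rng.standard_normal(4)))  # 4 output-channel values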

backends/xnnpack/operators/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@
     op_dynamic_quantize_ops,
     op_elu,
     op_floor,
+    op_gelu,
     op_hardswish,
     op_hardtanh,
     op_leaky_relu,
Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict
+
+import torch
+from executorch.backends.xnnpack.operators.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import (
+    XNNGelu,
+    XNNGraph,
+    XNode,
+)
+from executorch.backends.xnnpack.utils.utils import get_input_node
+
+
+@register_node_visitor
+class GeluVisitor(NodeVisitor):
+    target = "aten.gelu.default"
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        xnn_graph: XNNGraph,
+        vals_to_ids: Dict[torch.fx.Node, int],
+        debug_handle: int,
+    ) -> None:
+        self.define_nodes_tensor_inputs_outputs(node, xnn_graph, vals_to_ids)
+
+        # input
+        input_id = vals_to_ids[get_input_node(node, 0)]
+
+        # output
+        output_id = vals_to_ids[node]
+
+        ser_node = XNode(
+            xnode_union=XNNGelu(
+                input_id=input_id,
+                output_id=output_id,
+                flags=0,
+            ),
+            debug_handle=debug_handle,
+        )
+        xnn_graph.xnodes.append(ser_node)
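The hunk above appears to add a new node visitor (its file path is not shown in this view) that serializes aten.gelu.default into an XNNGelu node. As a rough sketch of how such a node would reach this visitor, assuming the usual export-and-lower flow with XnnpackPartitioner (the module and tensor shapes below are made up for illustration):

# Hedged sketch: exporting a GELU module and lowering it through the XNNPACK
# partitioner, which is the path that would exercise the visitor above.
import torch
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.exir import to_edge_transform_and_lower

class GeluModule(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.gelu(x)

exported = torch.export.export(GeluModule(), (torch.randn(1, 16),))
edge = to_edge_transform_and_lower(exported, partitioner=[XnnpackPartitioner()])
executorch_program = edge.to_executorch()  # gelu should now be delegated to XNNPACK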

backends/xnnpack/partition/config/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -26,6 +26,7 @@
     DeQuantizedPerTensorConfig,
     DivConfig,
     FloorConfig,
+    GeluConfig,
     HardswishConfig,
     # EluConfig,
     HardtanhConfig,
@@ -79,6 +80,7 @@
     DivConfig,
     # EluConfig, # Waiting for PyTorch Pin Update
     FloorConfig,
+    GeluConfig,
     HardtanhConfig,
     HardswishConfig,
     LeakyReLUConfig,

backends/xnnpack/partition/config/generic_node_configs.py

Lines changed: 7 additions & 0 deletions
@@ -343,6 +343,13 @@ def supported_precision_types(self) -> List[ConfigPrecisionType]:
         return [ConfigPrecisionType.FP32]


+class GeluConfig(GenericNodePartitionerConfig):
+    target_name = "gelu.default"
+
+    def supported_precision_types(self) -> List[ConfigPrecisionType]:
+        return [ConfigPrecisionType.FP32]
+
+
 class HardswishConfig(GenericNodePartitionerConfig):
     target_name = "hardswish.default"
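GeluConfig keys off the gelu.default target, so a quick way to see what the partitioner will match is to inspect an exported graph. The snippet below is illustrative only and assumes torch.export keeps aten.gelu.default undecomposed, which is typical of current export behavior:

# Illustrative check (assumption: export keeps gelu undecomposed) that an exported
# graph contains the aten.gelu.default target GeluConfig is keyed on.
import torch

class M(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.gelu(x)

graph = torch.export.export(M(), (torch.randn(4),)).graph
print(any(node.target == torch.ops.aten.gelu.default for node in graph.nodes))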
