
Commit 73bd4bc

Merge branch 'main' into private-s22-devices

2 parents f1266f8 + 9a8fcba

File tree

18 files changed: +195 -57 lines changed


.github/scripts/extract_benchmark_results.py

Lines changed: 5 additions & 1 deletion

@@ -349,7 +349,10 @@ def transform(
     # Overwrite the device name here with the job name as it has more information about
     # the device, i.e. Samsung Galaxy S22 5G instead of just Samsung
     for r in benchmark_results:
-        r["deviceInfo"]["device"] = job_name
+        is_private_device = job_report.get("is_private_instance", False)
+        r["deviceInfo"]["device"] = (
+            f"{job_name} (private)" if is_private_device else job_name
+        )

     # From https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
     return [
@@ -363,6 +366,7 @@ def transform(
                 "benchmark_config": json.dumps(benchmark_config),
                 "job_conclusion": "SUCCESS",
                 "job_arn": job_report.get("arn", ""),
+                "instance_arn": job_report.get("instance_arn", ""),
             },
         },
         "model": {

.github/workflows/apple-perf-private-device-experiment.yml

Lines changed: 7 additions & 9 deletions

@@ -1,18 +1,16 @@
 name: apple-perf (private devices)

 on:
-  # TODO (huydhn): Disable the schedule run until we land the change to add device pool and device name
-  # to separate between public and private iOS devices
-  # schedule:
-  #   - cron: 0 0,4,8,12,16,20 * * *
+  schedule:
+    - cron: 0 0,4,8,12,16,20 * * *
   pull_request:
     paths:
       - .github/workflows/apple-perf-private-device-experiment.yml
-  # push:
-  #   branches:
-  #     - main
-  #   paths:
-  #     - .github/workflows/apple-perf-private-device-experiment.yml
+  push:
+    branches:
+      - main
+    paths:
+      - .github/workflows/apple-perf-private-device-experiment.yml
   # Note: GitHub has an upper limit of 10 inputs
   workflow_dispatch:
     inputs:
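The re-enabled cron line fires at minute 0 of hours 0, 4, 8, 12, 16, and 20 UTC, i.e. every four hours. A quick Python sanity check of the expression, assuming the third-party croniter package is installed:

from datetime import datetime
from croniter import croniter

it = croniter("0 0,4,8,12,16,20 * * *", datetime(2025, 1, 1))
for _ in range(3):
    print(it.get_next(datetime))  # 04:00, 08:00, 12:00 on 2025-01-01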

backends/cadence/hifi/operators/op_bmm.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ using exec_aten::ScalarType;
1616
using executorch::runtime::KernelRuntimeContext;
1717
using executorch::runtime::kTensorDimensionLimit;
1818
using executorch::runtime::resize_tensor;
19-
using executorch::runtime::tensors_have_same_dim_order;
2019
using executorch::runtime::tensor_is_default_dim_order;
20+
using executorch::runtime::tensors_have_same_dim_order;
2121
using torch::executor::check_bmm_args;
2222
using torch::executor::Error;
2323
using torch::executor::get_bmm_out_target_size;
@@ -78,16 +78,16 @@ Tensor& bmm_out(
7878
WORD32 out_stride = p;
7979

8080
WORD32* __restrict__ tmp =
81-
(WORD32* __restrict__)kernels::allocate_temp_memory(
82-
ctx, (batch_size * m * p) * sizeof(float));
81+
(WORD32* __restrict__)kernels::allocate_temp_memory(
82+
ctx, (batch_size * m * p) * sizeof(float));
8383

8484
ET_KERNEL_CHECK(ctx, tmp != nullptr, MemoryAllocationFailed, out);
8585

8686
tmp[batch_size * m * p] = {0};
8787

8888
WORD32* __restrict__ p_o =
89-
(WORD32* __restrict__)kernels::allocate_temp_memory(
90-
ctx, (batch_size * m * p) * sizeof(WORD32));
89+
(WORD32* __restrict__)kernels::allocate_temp_memory(
90+
ctx, (batch_size * m * p) * sizeof(WORD32));
9191

9292
ET_KERNEL_CHECK(ctx, p_o != nullptr, MemoryAllocationFailed, out);
9393
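For context on the sizes allocated above: a batched matmul of shapes (B, m, n) x (B, n, p) produces batch_size * m * p output elements, which is exactly the element count requested for both scratch buffers. A small NumPy sketch with hypothetical dimensions:

import numpy as np

batch_size, m, n, p = 2, 3, 4, 5   # hypothetical shapes
out = np.zeros((batch_size, m, n)) @ np.zeros((batch_size, n, p))
assert out.size == batch_size * m * p   # 30 elements, matching the allocation above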

backends/cadence/hifi/operators/op_mm.cpp

Lines changed: 3 additions & 3 deletions

@@ -76,8 +76,8 @@ Tensor& mm_out(
   WORD32 out_stride = p;

   WORD32* __restrict__ p_o =
-      (WORD32* __restrict__)kernels::allocate_temp_memory(
-          ctx, (n * p) * sizeof(WORD32));
+      (WORD32* __restrict__)kernels::allocate_temp_memory(
+          ctx, (n * p) * sizeof(WORD32));

   WORD32 p_inp_shape[2];
   p_inp_shape[0] = n;
@@ -146,4 +146,4 @@ Tensor& mm_out(
 } // namespace native
 } // namespace HiFi
 } // namespace impl
-} // namespace cadence
+} // namespace cadence

backends/cadence/hifi/operators/targets.bzl

Lines changed: 5 additions & 0 deletions

@@ -34,6 +34,11 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
 OPERATORS = [
     "add",
     "atan2",
+    "bmm",
+    "mm",
+    "slice_copy",
+    "split_with_sizes_copy",
+    "view_copy",
     "cat",
     "clamp",
     "dequantize_per_tensor",

backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_coop.glsl

Lines changed: 6 additions & 3 deletions

@@ -38,18 +38,21 @@ layout(push_constant) uniform restrict Block {
   ivec4 weight_sizes;
 };

+#include "indexing_utils.h"
+
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

 shared VEC4_T partial_c[NGROUPS][NWORKERS][TILE_ROWS];

 void main() {
-  const uint out_row = gl_GlobalInvocationID.y * TILE_ROWS;
-  const uint out_col = gl_GlobalInvocationID.x << 2;
+  const uint out_width_ntexels = divup4(out_sizes.x);
+  const uint out_col = (gl_GlobalInvocationID.x % out_width_ntexels) << 2;
+  const uint out_row = (gl_GlobalInvocationID.x / out_width_ntexels) * TILE_ROWS;

   const int gid = int(gl_LocalInvocationID.x); // group id
   const int wid = int(gl_LocalInvocationID.z); // worker id

-  if (out_col >= out_sizes.x || out_row >= out_sizes.y) {
+  if (out_row >= out_sizes.y) {
     return;
   }
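Both GLSL changes in this commit replace a 2D dispatch with a 1D one: the shader decodes invocation index x into a column texel (x mod out_width_ntexels) and a row tile (x div out_width_ntexels). A Python sketch of that index math under hypothetical output sizes (divup4(n) counts the 4-wide texels needed to cover n columns):

def div_up(a: int, b: int) -> int:
    return (a + b - 1) // b

TILE_ROWS = 4
out_width, out_height = 10, 8             # hypothetical out_sizes.x, out_sizes.y
out_width_ntexels = div_up(out_width, 4)  # divup4: 3 texels cover 10 columns

# One invocation per (column texel, row tile) pair, flattened into x:
for x in range(out_width_ntexels * div_up(out_height, TILE_ROWS)):
    out_col = (x % out_width_ntexels) << 2          # first column of this texel
    out_row = (x // out_width_ntexels) * TILE_ROWS  # first row of this tile
    print(x, out_col, out_row)

Because out_col is bounded by construction (the modulo keeps it inside the last texel, whose first column is always less than out_sizes.x), only the out_row bound check survives in the shader.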

backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.glsl

Lines changed: 13 additions & 8 deletions

@@ -36,13 +36,18 @@ layout(push_constant) uniform restrict Block {
   ivec4 weight_sizes;
 };

+#include "indexing_utils.h"
+
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+
 void main() {
-  const uint out_row = gl_GlobalInvocationID.y * TILE_ROWS;
-  const uint out_col = gl_GlobalInvocationID.x << 2;
+  const uint16_t out_width_ntexels = uint16_t(divup4(out_sizes.x));
+  const uint16_t out_col = uint16_t((gl_GlobalInvocationID.x % out_width_ntexels) << 2);
+  const uint16_t out_row = uint16_t((gl_GlobalInvocationID.x / out_width_ntexels) * TILE_ROWS);

-  if (out_col >= out_sizes.x || out_row >= out_sizes.y) {
+  if (out_row >= uint16_t(out_sizes.y)) {
     return;
   }

@@ -51,29 +56,29 @@ void main() {
   VEC4_T c[TILE_ROWS];

   $if SCALES_STORAGE == "buffer":
-    const VEC4_T scales = VEC4_T(t_scales[out_col >> 2]);
+    const VEC4_T scales = VEC4_T(t_scales[int(out_col >> 2)]);
   $else:
-    const VEC4_T scales = VEC4_T(texelFetch(t_scales, ivec2(out_col >> 2, 0), 0));
+    const VEC4_T scales = VEC4_T(texelFetch(t_scales, u16vec2(out_col >> 2, 0), 0));

   [[unroll]] for (int i = 0; i < TILE_ROWS; ++i) {
     c[i] = VEC4_T(0.0);
   }

-  for (int pos = 0; pos < in_sizes.x; pos += 4) {
+  for (uint16_t pos = uint16_t(0); pos < uint16_t(in_sizes.x); pos += uint16_t(4)) {
     // Preload weight tensor
     [[unroll]] for (int i = 0; i < 4; i++) {
       $if WEIGHT_STORAGE == "buffer":
         b[i] = t_weight[((pos + i) * out_sizes.x + out_col) >> 2];
       $else:
-        b[i] = VEC4_T(texelFetch(t_weight, ivec2(out_col >> 2, pos + i), 0));
+        b[i] = VEC4_T(texelFetch(t_weight, u16vec2(out_col >> 2, pos + i), 0));
     }

     // Preload input tensor
     [[unroll]] for (int i = 0; i < TILE_ROWS; i++) {
       $if IN_STORAGE == "buffer":
         a[i] = t_in[((out_row + i) * in_sizes.x + pos) >> 2];
       $else:
-        a[i] = VEC4_T(texelFetch(t_in, u16vec3(pos >> 2, out_row + i, 0), 0));
+        a[i] = VEC4_T(texelFetch(t_in, u16vec3(pos >> 2, out_row + i, 0), 0));
     }

     // Accumulate output

backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.yaml

Lines changed: 2 additions & 2 deletions

@@ -16,10 +16,10 @@ q_8w_linear_tiled:
     TILE_ROWS:
       - VALUE: 1
         SUFFIX: o4x1
+      - VALUE: 2
+        SUFFIX: o4x2
       - VALUE: 4
         SUFFIX: o4x4
-      - VALUE: 6
-        SUFFIX: o4x6
   shader_variants:
     - NAME: q_8w_linear_tiled_texture3d_texture3d_texture2d_texture2d_float
     - NAME: q_8w_linear_tiled_buffer_buffer_texture2d_texture2d_float

backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp

Lines changed: 8 additions & 5 deletions

@@ -180,10 +180,10 @@ void add_q_8w_linear_tiled_node(
   std::vector<int64_t> mat1_sizes = graph.sizes_of(mat1);
   const int64_t M = utils::val_at(-2, mat1_sizes);
-  int out_tile_nrows = 4;
+  uint32_t out_tile_nrows = 4;
   if (M % 6 == 0) {
-    kernel_name += "_o4x6";
-    out_tile_nrows = 6;
+    kernel_name += "_o4x2";
+    out_tile_nrows = 2;
   } else if (M % 4 == 0) {
     kernel_name += "_o4x4";
     out_tile_nrows = 4;
@@ -195,8 +195,11 @@ void add_q_8w_linear_tiled_node(
     out_tile_nrows = 4;
   }

-  utils::uvec3 global_wg_size = graph.logical_limits_of(out);
-  global_wg_size[1] = global_wg_size[1] / out_tile_nrows;
+  utils::uvec3 out_limits = graph.logical_limits_of(out);
+  utils::uvec3 global_wg_size = {
+      out_limits[0] * (utils::div_up(out_limits[1], out_tile_nrows)),
+      1,
+      out_limits[2]};

   utils::uvec3 local_wg_size{64, 1, 1};
   if (use_coop_algorithm) {
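The host-side dispatch mirrors the shader's flattening: the x and y axes fold into x, and y collapses to 1. A Python sketch of the new global workgroup size with hypothetical limits (div_up below stands in for utils::div_up):

def div_up(a: int, b: int) -> int:
    return (a + b - 1) // b

out_limits = [3, 8, 1]    # hypothetical logical limits: x texels, y rows, z
out_tile_nrows = 2
global_wg_size = [
    out_limits[0] * div_up(out_limits[1], out_tile_nrows),  # 3 * 4 = 12
    1,                                                      # y folded into x
    out_limits[2],
]
print(global_wg_size)  # [12, 1, 1]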

exir/capture/_config.py

Lines changed: 3 additions & 0 deletions

@@ -102,3 +102,6 @@ class ExecutorchBackendConfig:
     # serialized in the PTE file. Its value is ignored if mutable buffers are not
     # memory planned as the names must be serialized in that case.
     emit_mutable_buffer_names: bool = False
+
+    # If set to true, we run quant fusion and constant propagation passes
+    do_quant_fusion_and_const_prop: bool = False
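A sketch of opting in to the new flag when lowering a program; ExecutorchBackendConfig is importable from executorch.exir, while edge_program here is a placeholder for an existing EdgeProgramManager:

from executorch.exir import ExecutorchBackendConfig

config = ExecutorchBackendConfig(do_quant_fusion_and_const_prop=True)
# executorch_program = edge_program.to_executorch(config)  # edge_program: placeholder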
