
Commit c63b55a

Merge branch 'main' into final-pt2e-fixups
2 parents: b0c1875 + ce48e0d

File tree: 15 files changed, +462 −87 lines

.lintrunner.toml

Lines changed: 2 additions & 0 deletions

@@ -271,6 +271,8 @@ exclude_patterns = [
     'examples/**',
     'exir/verification/bindings.cpp',
     'extension/**',
+    # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include.
+    'kernels/portable/cpu/util/vectorized_math.h',
     'kernels/optimized/**',
     'runtime/core/exec_aten/**',
     # Want to be able to keep c10 in sync with PyTorch core.
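The new exclusion is justified by the gating: vectorized_math.h only reaches ATen headers when ET_USE_PYTORCH_HEADERS is defined. A minimal sketch of that gating pattern, with an assumed exp_compat helper for illustration (not the header's actual contents):

// Sketch of a properly-gated ATen include; the helper below is an
// illustrative assumption, not code from vectorized_math.h.
#include <cmath>
#include <type_traits>

#ifdef ET_USE_PYTORCH_HEADERS
#include <ATen/cpu/vec/vec.h> // only reachable when built against PyTorch
#endif

template <typename T>
T exp_compat(T x) {
#ifdef ET_USE_PYTORCH_HEADERS
  if constexpr (!std::is_arithmetic_v<T>) {
    return x.exp(); // e.g. at::vec::Vectorized<float> provides exp()
  } else {
    return std::exp(x);
  }
#else
  return std::exp(x);
#endif
}

Because the ATen include never compiles in a plain ExecuTorch build, the header is safe to exclude from the lint rule that forbids ATen includes in portable code.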

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl

Lines changed: 7 additions & 8 deletions

@@ -60,26 +60,25 @@ void main() {
   const uint div_by_x = gl_GlobalInvocationID.x / out_limits_xy_scaled.x;
   ivec3 pos = ivec3(
       gl_GlobalInvocationID.x % out_limits_xy_scaled.x,
-      div_by_x % out_limits_xy_scaled.y,
-      div_by_x / out_limits_xy_scaled.y);
-
-  // scale pos.xy by batch sizes, because that's the top pixel to be processed
-  pos.x *= BATCH_SIZE_X;
-  pos.y *= BATCH_SIZE_Y;
+      div_by_x,
+      gl_GlobalInvocationID.y);

   // do not process if top pixel does not fit within the output range
-  if (pos.z >= out_limits.z) {
+  if (pos.y >= out_limits_xy_scaled.y || pos.z >= out_limits.z) {
     return;
   }

+  // scale pos.xy by batch sizes, because that's the top pixel to be processed
+  pos.x *= BATCH_SIZE_X;
+  pos.y *= BATCH_SIZE_Y;
+
   // Compute the index of the top-left element of the overlay region. Negative
   // indices indicate that the top-left element is in a region added by padding.
   const ivec2 ipos = pos.xy * stride - padding;

   // Compute the start and end of the input indices to load. Padding is assumed
   // to be constant 0 padding, so any reads from the padding region is skipped.
   const ivec2 start = ipos;
-  const ivec2 end = ipos + overlay_region.xy;

   // sum outputs
   VEC4_T sum[BATCH_SIZE_Y * BATCH_SIZE_X];
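Both depthwise shaders (this one and the sned variant below) switch from a fully flattened 1D dispatch to a 2D dispatch: gl_GlobalInvocationID.x now encodes only the flattened (x, y) plane while gl_GlobalInvocationID.y supplies z directly. Since workgroup-size padding of the x dimension can now push the flattened index past the plane, pos.y needs its own bounds check. A plain-C++ sketch of the two mappings (struct and function names are illustrative, not from the codebase):

#include <cstdint>

struct Pos {
  uint32_t x, y, z;
};

// Old scheme: fully flattened 1D dispatch. y and z are both derived from
// id_x, so only z could run past the output extents.
Pos pos_from_1d(uint32_t id_x, uint32_t w, uint32_t h) {
  const uint32_t div_by_x = id_x / w;
  return {id_x % w, div_by_x % h, div_by_x / h};
}

// New scheme: 2D dispatch. id_x carries only the flattened (x, y) plane and
// id_y carries z directly, so padding in x can overflow y; hence the added
// `pos.y >= out_limits_xy_scaled.y` check in the shader.
Pos pos_from_2d(uint32_t id_x, uint32_t id_y, uint32_t w) {
  return {id_x % w, id_x / w, id_y};
}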

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl

Lines changed: 4 additions & 4 deletions

@@ -50,10 +50,11 @@ void main() {
   const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x;
   const ivec3 pos = ivec3(
       gl_GlobalInvocationID.x % out_limits.x,
-      div_by_x % out_limits.y,
-      div_by_x / out_limits.y);
+      div_by_x,
+      gl_GlobalInvocationID.y);

-  if (pos.z >= out_limits.z) {
+  // do not process if top pixel does not fit within the output range
+  if (pos.y >= out_limits.y || pos.z >= out_limits.z) {
     return;
   }

@@ -64,7 +65,6 @@
   // Compute the start and end of the input indices to load. Padding is assumed
   // to be constant 0 padding, so any reads from the padding region is skipped.
   const ivec2 start = ipos;
-  const ivec2 end = ipos + overlay_region.xy;

   VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
   int kx = 0;

backends/vulkan/runtime/graph/ops/impl/Convolution.cpp

Lines changed: 4 additions & 4 deletions

@@ -407,13 +407,11 @@ void add_conv2d_node(
   utils::uvec3 wg_size = create_conv2d_global_wg_size(
       graph, method, out, weight_data, stride_equals_dilation);

-  if (method == Conv2dMethod::Depthwise) {
-    wg_size = {wg_size[0] * wg_size[1] * wg_size[2], 1, 1};
-  } else if (method == Conv2dMethod::Pointwise) {
+  utils::uvec3 local_wg_size;
+  if (method == Conv2dMethod::Depthwise || method == Conv2dMethod::Pointwise) {
     wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1};
   }

-  utils::uvec3 local_wg_size;
   if (method == Conv2dMethod::Pointwise) {
     uint32_t local_wg_size_y = 1;
     if (wg_size[1] % 8 == 0) {
@@ -424,6 +422,8 @@ void add_conv2d_node(
       local_wg_size_y = 2;
     }
     local_wg_size = {64 / local_wg_size_y, local_wg_size_y, 1};
+  } else if (method == Conv2dMethod::Depthwise) {
+    local_wg_size = {64, 1, 1};
   } else {
     local_wg_size = graph.create_local_wg_size(wg_size);
   }
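Combined with the shader change, depthwise convolution now launches a {W·H, Z, 1} global size with a fixed {64, 1, 1} local size instead of flattening everything into x. The sketch below (illustrative, not from the codebase) shows the round-up arithmetic that creates the padded invocations the shader's new bounds check must reject:

#include <cstdint>
#include <cstdio>

// Number of workgroups needed to cover `global` invocations with
// workgroups of `local` size (round up).
static uint32_t num_groups(uint32_t global, uint32_t local) {
  return (global + local - 1) / local;
}

int main() {
  // Example: a 37x23 output plane dispatched as {W*H, Z, 1} with a
  // {64, 1, 1} local size.
  const uint32_t plane = 37 * 23;                  // 851 useful invocations in x
  const uint32_t groups_x = num_groups(plane, 64); // 14 workgroups
  const uint32_t launched_x = groups_x * 64;       // 896 invocations launched
  // 896 - 851 = 45 invocations land past the plane; in the shader they
  // produce pos.y >= out_limits.y and return early.
  std::printf(
      "launched=%u useful=%u padded=%u\n", launched_x, plane, launched_x - plane);
  return 0;
}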

backends/vulkan/test/op_tests/cases.py

Lines changed: 131 additions & 45 deletions

@@ -279,17 +279,6 @@ def get_conv_inputs():
             output_padding=[0, 1],
             groups=1,
         ),
-        Test(
-            self=(1, 8, 72, 96),
-            weight=(8, 1, 3, 3),
-            bias=(8,),
-            stride=[1, 1],
-            padding=[1, 1],
-            dilation=[1, 1],
-            transposed=False,
-            output_padding=[0, 0],
-            groups=8,
-        ),
         Test(
             self=(1, 6, 40, 50),
             weight=(8, 6, 3, 3),
@@ -345,39 +334,6 @@ def get_conv_inputs():
             output_padding=[0],
             groups=5,
         ),
-        Test(
-            self=(1, 4, 234, 234),
-            weight=(4, 1, 3, 3),
-            bias=(4,),
-            stride=[2, 1],
-            padding=[1, 1],
-            dilation=[1, 1],
-            transposed=False,
-            output_padding=[0, 0],
-            groups=4,
-        ),
-        Test(
-            self=(1, 4, 234, 234),
-            weight=(4, 1, 3, 3),
-            bias=(4,),
-            stride=[1, 2],
-            padding=[1, 1],
-            dilation=[1, 1],
-            transposed=False,
-            output_padding=[0, 0],
-            groups=4,
-        ),
-        Test(
-            self=(1, 4, 234, 234),
-            weight=(4, 1, 3, 3),
-            bias=(4,),
-            stride=[2, 2],
-            padding=[1, 1],
-            dilation=[1, 1],
-            transposed=False,
-            output_padding=[0, 0],
-            groups=4,
-        ),
         Test(
             self=(1, 8, 90, 77),
             weight=(1, 8, 3, 3),
@@ -526,6 +482,130 @@ def get_conv_inputs():
         ),
     ]

+    test_cases_dw = [
+        Test(
+            self=(1, XS, S, S1),
+            weight=(XS, 1, 3, 3),
+            bias=(XS,),
+            stride=[1, 1],
+            padding=[1, 1],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=XS,
+        ),
+        Test(
+            self=(1, XS, S, S1),
+            weight=(XS, 1, 5, 5),
+            bias=(XS,),
+            stride=[1, 1],
+            padding=[2, 2],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=XS,
+        ),
+        Test(
+            self=(1, XS, S, S1),
+            weight=(XS, 1, 3, 3),
+            bias=(XS,),
+            stride=[2, 1],
+            padding=[1, 1],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=XS,
+        ),
+        Test(
+            self=(1, XS, S, S1),
+            weight=(XS, 1, 5, 5),
+            bias=(XS,),
+            stride=[1, 2],
+            padding=[2, 2],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=XS,
+        ),
+        Test(
+            self=(1, S2, S, S1),
+            weight=(S2, 1, 3, 3),
+            bias=(S2,),
+            stride=[1, 1],
+            padding=[1, 1],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=S2,
+        ),
+        Test(
+            self=(1, S2, S, S1),
+            weight=(S2, 1, 5, 5),
+            bias=(S2,),
+            stride=[1, 1],
+            padding=[2, 2],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=S2,
+        ),
+        Test(
+            self=(1, 8, 72, 96),
+            weight=(8, 1, 3, 3),
+            bias=(8,),
+            stride=[1, 1],
+            padding=[1, 1],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=8,
+        ),
+        Test(
+            self=(1, 8, 72, 96),
+            weight=(8, 1, 5, 5),
+            bias=(8,),
+            stride=[1, 1],
+            padding=[2, 2],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=8,
+        ),
+        Test(
+            self=(1, 4, 234, 234),
+            weight=(4, 1, 3, 3),
+            bias=(4,),
+            stride=[2, 1],
+            padding=[1, 1],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=4,
+        ),
+        Test(
+            self=(1, 4, 234, 234),
+            weight=(4, 1, 3, 3),
+            bias=(4,),
+            stride=[1, 2],
+            padding=[1, 1],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=4,
+        ),
+        Test(
+            self=(1, 4, 234, 234),
+            weight=(4, 1, 3, 3),
+            bias=(4,),
+            stride=[2, 2],
+            padding=[1, 1],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=4,
+        ),
+    ]
+
     test_suite = VkTestSuite(test_cases)
     test_suite.layouts = [
         "utils::kChannelsPacked",
@@ -536,7 +616,13 @@ def get_conv_inputs():
         "utils::kChannelsPacked",
     ]
     test_suite_pw.test_name_suffix = "pw"
-    return [test_suite, test_suite_pw]
+
+    test_suite_dw = VkTestSuite(test_cases_dw)
+    test_suite_dw.layouts = [
+        "utils::kChannelsPacked",
+    ]
+    test_suite_dw.test_name_suffix = "dw"
+    return [test_suite, test_suite_pw, test_suite_dw]


 @register_test_suite("aten.native_layer_norm.default")

kernels/portable/CMakeLists.txt

Lines changed: 8 additions & 1 deletion

@@ -69,8 +69,15 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
   target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options})
   target_include_directories(optimized_portable_kernels PRIVATE ${TORCH_INCLUDE_DIRS})
   target_compile_definitions(optimized_portable_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
+  gen_selected_ops(LIB_NAME "optimized_portable_ops_lib" OPS_SCHEMA_YAML "${_yaml}")
+  generate_bindings_for_kernels(
+    LIB_NAME "optimized_portable_ops_lib" FUNCTIONS_YAML "${_yaml}"
+  )
+  gen_operators_lib(
+    LIB_NAME "optimized_portable_ops_lib" KERNEL_LIBS optimized_portable_kernels DEPS executorch_core
+  )
   install(
-    TARGETS optimized_portable_kernels
+    TARGETS optimized_portable_kernels optimized_portable_ops_lib
     DESTINATION lib
   )
 endif()

kernels/portable/cpu/util/targets.bzl

Lines changed: 10 additions & 0 deletions

@@ -307,6 +307,16 @@ def define_common_targets():
         ],
     )

+    runtime.cxx_library(
+        name = "vectorized_math",
+        exported_headers = ["vectorized_math.h"],
+        visibility = ["//executorch/..."],
+        exported_deps = [
+            "//executorch/runtime/core/portable_type:portable_type",
+            "//executorch/runtime/core/exec_aten/util:scalar_type_util",
+        ],
+    )
+
     # Utility functions that can be used by operators that perform reduction
     for aten_mode in get_aten_mode_options():
         suffix = "_aten" if aten_mode else ""

kernels/portable/cpu/util/test/CMakeLists.txt

Lines changed: 6 additions & 10 deletions

@@ -4,26 +4,22 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-# @generated by test/utils/generate_gtest_cmakelists.py
-#
-# This file should be formatted with
-# ~~~
-# cmake-format -i CMakeLists.txt
-# ~~~
-# It should also be cmake-lint clean.
-#
-
 cmake_minimum_required(VERSION 3.19)

 set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..)

 include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

 set(_test_srcs broadcast_indexes_range_test.cpp broadcast_test.cpp
-               reduce_test.cpp
+               reduce_test.cpp vectorized_math_test.cpp
 )

 et_cxx_test(
   kernels_portable_cpu_util_test SOURCES ${_test_srcs} EXTRA_LIBS
   portable_kernels portable_ops_lib
 )
+
+find_package_torch_headers()
+target_include_directories(kernels_portable_cpu_util_test PRIVATE ${TORCH_INCLUDE_DIRS})
+target_compile_definitions(kernels_portable_cpu_util_test PRIVATE ET_USE_PYTORCH_HEADERS)
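The test target needs the torch headers and the ET_USE_PYTORCH_HEADERS define because vectorized_math builds on ATen's at::vec types. A minimal sketch of the kind of check such a test can make, written against only ATen's public Vectorized API; this is not the contents of the actual vectorized_math_test.cpp:

// Hedged sketch: verifies a vectorized transcendental against its scalar
// counterpart, lane by lane, when PyTorch headers are available.
#include <array>
#include <cmath>

#include <gtest/gtest.h>

#ifdef ET_USE_PYTORCH_HEADERS
#include <ATen/cpu/vec/vec.h>

TEST(VectorizedMathTest, VecExpMatchesScalarExp) {
  using Vec = at::vec::Vectorized<float>;
  std::array<float, Vec::size()> lanes{};
  Vec v(0.5f); // broadcast 0.5f into every lane
  v.exp().store(lanes.data());
  for (const float lane : lanes) {
    EXPECT_FLOAT_EQ(lane, std::exp(0.5f));
  }
}
#endif // ET_USE_PYTORCH_HEADERS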

kernels/portable/cpu/util/test/targets.bzl

Lines changed: 11 additions & 0 deletions

@@ -32,3 +32,14 @@ def define_common_targets():
             "//executorch/kernels/portable/cpu/util:reduce_util",
         ],
     )
+
+    # this test requires ET_USE_PYTORCH_HEADERS, which doesn't work in OSS Buck.
+    if not runtime.is_oss:
+        runtime.cxx_test(
+            name = "vectorized_math_test",
+            srcs = ["vectorized_math_test.cpp"],
+            deps = [
+                "//executorch/kernels/portable/cpu/util:vectorized_math",
+                "//executorch/runtime/core/portable_type/c10/c10:c10",
+            ],
+        )
