
Commit c63b55a

Merge branch 'main' into final-pt2e-fixups
2 parents: b0c1875 + ce48e0d

File tree: 15 files changed, +462 −87 lines

.lintrunner.toml

Lines changed: 2 additions & 0 deletions

@@ -271,6 +271,8 @@ exclude_patterns = [
     'examples/**',
     'exir/verification/bindings.cpp',
     'extension/**',
+    # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include.
+    'kernels/portable/cpu/util/vectorized_math.h',
     'kernels/optimized/**',
     'runtime/core/exec_aten/**',
     # Want to be able to keep c10 in sync with PyTorch core.
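The new exclusion is justified by the gating: vectorized_math.h only reaches ATen headers when ET_USE_PYTORCH_HEADERS is defined. A minimal sketch of that gating pattern, with an assumed exp_compat helper for illustration (not the header's actual contents):

// Sketch of a properly-gated ATen include; the helper below is an
// illustrative assumption, not code from vectorized_math.h.
#include <cmath>
#include <type_traits>

#ifdef ET_USE_PYTORCH_HEADERS
#include <ATen/cpu/vec/vec.h> // only reachable when built against PyTorch
#endif

template <typename T>
T exp_compat(T x) {
#ifdef ET_USE_PYTORCH_HEADERS
  if constexpr (!std::is_arithmetic_v<T>) {
    return x.exp(); // e.g. at::vec::Vectorized<float> provides exp()
  } else {
    return std::exp(x);
  }
#else
  return std::exp(x);
#endif
}

Because the ATen include never compiles in a plain ExecuTorch build, the header is safe to exclude from the lint rule that forbids ATen includes in portable code.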

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl

Lines changed: 7 additions & 8 deletions

@@ -60,26 +60,25 @@ void main() {
   const uint div_by_x = gl_GlobalInvocationID.x / out_limits_xy_scaled.x;
   ivec3 pos = ivec3(
       gl_GlobalInvocationID.x % out_limits_xy_scaled.x,
-      div_by_x % out_limits_xy_scaled.y,
-      div_by_x / out_limits_xy_scaled.y);
-
-  // scale pos.xy by batch sizes, because that's the top pixel to be processed
-  pos.x *= BATCH_SIZE_X;
-  pos.y *= BATCH_SIZE_Y;
+      div_by_x,
+      gl_GlobalInvocationID.y);

   // do not process if top pixel does not fit within the output range
-  if (pos.z >= out_limits.z) {
+  if (pos.y >= out_limits_xy_scaled.y || pos.z >= out_limits.z) {
     return;
   }

+  // scale pos.xy by batch sizes, because that's the top pixel to be processed
+  pos.x *= BATCH_SIZE_X;
+  pos.y *= BATCH_SIZE_Y;
+
   // Compute the index of the top-left element of the overlay region. Negative
   // indices indicate that the top-left element is in a region added by padding.
   const ivec2 ipos = pos.xy * stride - padding;

   // Compute the start and end of the input indices to load. Padding is assumed
   // to be constant 0 padding, so any reads from the padding region is skipped.
   const ivec2 start = ipos;
-  const ivec2 end = ipos + overlay_region.xy;

   // sum outputs
   VEC4_T sum[BATCH_SIZE_Y * BATCH_SIZE_X];
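Both depthwise shaders (this one and the sned variant below) switch from a fully flattened 1D dispatch to a 2D dispatch: gl_GlobalInvocationID.x now encodes only the flattened (x, y) plane while gl_GlobalInvocationID.y supplies z directly. Since workgroup-size padding of the x dimension can now push the flattened index past the plane, pos.y needs its own bounds check. A plain-C++ sketch of the two mappings (struct and function names are illustrative, not from the codebase):

#include <cstdint>

struct Pos {
  uint32_t x, y, z;
};

// Old scheme: fully flattened 1D dispatch. y and z are both derived from
// id_x, so only z could run past the output extents.
Pos pos_from_1d(uint32_t id_x, uint32_t w, uint32_t h) {
  const uint32_t div_by_x = id_x / w;
  return {id_x % w, div_by_x % h, div_by_x / h};
}

// New scheme: 2D dispatch. id_x carries only the flattened (x, y) plane and
// id_y carries z directly, so padding in x can overflow y; hence the added
// `pos.y >= out_limits_xy_scaled.y` check in the shader.
Pos pos_from_2d(uint32_t id_x, uint32_t id_y, uint32_t w) {
  return {id_x % w, id_x / w, id_y};
}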

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl

Lines changed: 4 additions & 4 deletions

@@ -50,10 +50,11 @@ void main() {
   const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x;
   const ivec3 pos = ivec3(
       gl_GlobalInvocationID.x % out_limits.x,
-      div_by_x % out_limits.y,
-      div_by_x / out_limits.y);
+      div_by_x,
+      gl_GlobalInvocationID.y);

-  if (pos.z >= out_limits.z) {
+  // do not process if top pixel does not fit within the output range
+  if (pos.y >= out_limits.y || pos.z >= out_limits.z) {
     return;
   }

@@ -64,7 +65,6 @@
   // Compute the start and end of the input indices to load. Padding is assumed
   // to be constant 0 padding, so any reads from the padding region is skipped.
   const ivec2 start = ipos;
-  const ivec2 end = ipos + overlay_region.xy;

   VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
   int kx = 0;

backends/vulkan/runtime/graph/ops/impl/Convolution.cpp

Lines changed: 4 additions & 4 deletions

@@ -407,13 +407,11 @@ void add_conv2d_node(
   utils::uvec3 wg_size = create_conv2d_global_wg_size(
       graph, method, out, weight_data, stride_equals_dilation);

-  if (method == Conv2dMethod::Depthwise) {
-    wg_size = {wg_size[0] * wg_size[1] * wg_size[2], 1, 1};
-  } else if (method == Conv2dMethod::Pointwise) {
+  utils::uvec3 local_wg_size;
+  if (method == Conv2dMethod::Depthwise || method == Conv2dMethod::Pointwise) {
     wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1};
   }

-  utils::uvec3 local_wg_size;
   if (method == Conv2dMethod::Pointwise) {
     uint32_t local_wg_size_y = 1;
     if (wg_size[1] % 8 == 0) {
@@ -424,6 +422,8 @@ void add_conv2d_node(
       local_wg_size_y = 2;
     }
     local_wg_size = {64 / local_wg_size_y, local_wg_size_y, 1};
+  } else if (method == Conv2dMethod::Depthwise) {
+    local_wg_size = {64, 1, 1};
   } else {
     local_wg_size = graph.create_local_wg_size(wg_size);
   }
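Combined with the shader change, depthwise convolution now launches a {W·H, Z, 1} global size with a fixed {64, 1, 1} local size instead of flattening everything into x. The sketch below (illustrative, not from the codebase) shows the round-up arithmetic that creates the padded invocations the shader's new bounds check must reject:

#include <cstdint>
#include <cstdio>

// Number of workgroups needed to cover `global` invocations with
// workgroups of `local` size (round up).
static uint32_t num_groups(uint32_t global, uint32_t local) {
  return (global + local - 1) / local;
}

int main() {
  // Example: a 37x23 output plane dispatched as {W*H, Z, 1} with a
  // {64, 1, 1} local size.
  const uint32_t plane = 37 * 23;                  // 851 useful invocations in x
  const uint32_t groups_x = num_groups(plane, 64); // 14 workgroups
  const uint32_t launched_x = groups_x * 64;       // 896 invocations launched
  // 896 - 851 = 45 invocations land past the plane; in the shader they
  // produce pos.y >= out_limits.y and return early.
  std::printf(
      "launched=%u useful=%u padded=%u\n", launched_x, plane, launched_x - plane);
  return 0;
}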

backends/vulkan/test/op_tests/cases.py

Lines changed: 131 additions & 45 deletions

@@ -279,17 +279,6 @@ def get_conv_inputs():
             output_padding=[0, 1],
             groups=1,
         ),
-        Test(
-            self=(1, 8, 72, 96),
-            weight=(8, 1, 3, 3),
-            bias=(8,),
-            stride=[1, 1],
-            padding=[1, 1],
-            dilation=[1, 1],
-            transposed=False,
-            output_padding=[0, 0],
-            groups=8,
-        ),
         Test(
             self=(1, 6, 40, 50),
             weight=(8, 6, 3, 3),
@@ -345,39 +334,6 @@ def get_conv_inputs():
             output_padding=[0],
             groups=5,
         ),
-        Test(
-            self=(1, 4, 234, 234),
-            weight=(4, 1, 3, 3),
-            bias=(4,),
-            stride=[2, 1],
-            padding=[1, 1],
-            dilation=[1, 1],
-            transposed=False,
-            output_padding=[0, 0],
-            groups=4,
-        ),
-        Test(
-            self=(1, 4, 234, 234),
-            weight=(4, 1, 3, 3),
-            bias=(4,),
-            stride=[1, 2],
-            padding=[1, 1],
-            dilation=[1, 1],
-            transposed=False,
-            output_padding=[0, 0],
-            groups=4,
-        ),
-        Test(
-            self=(1, 4, 234, 234),
-            weight=(4, 1, 3, 3),
-            bias=(4,),
-            stride=[2, 2],
-            padding=[1, 1],
-            dilation=[1, 1],
-            transposed=False,
-            output_padding=[0, 0],
-            groups=4,
-        ),
         Test(
             self=(1, 8, 90, 77),
             weight=(1, 8, 3, 3),
@@ -526,6 +482,130 @@ def get_conv_inputs():
         ),
     ]

+    test_cases_dw = [
+        Test(
+            self=(1, XS, S, S1),
+            weight=(XS, 1, 3, 3),
+            bias=(XS,),
+            stride=[1, 1],
+            padding=[1, 1],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=XS,
+        ),
+        Test(
+            self=(1, XS, S, S1),
+            weight=(XS, 1, 5, 5),
+            bias=(XS,),
+            stride=[1, 1],
+            padding=[2, 2],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=XS,
+        ),
+        Test(
+            self=(1, XS, S, S1),
+            weight=(XS, 1, 3, 3),
+            bias=(XS,),
+            stride=[2, 1],
+            padding=[1, 1],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=XS,
+        ),
+        Test(
+            self=(1, XS, S, S1),
+            weight=(XS, 1, 5, 5),
+            bias=(XS,),
+            stride=[1, 2],
+            padding=[2, 2],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=XS,
+        ),
+        Test(
+            self=(1, S2, S, S1),
+            weight=(S2, 1, 3, 3),
+            bias=(S2,),
+            stride=[1, 1],
+            padding=[1, 1],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=S2,
+        ),
+        Test(
+            self=(1, S2, S, S1),
+            weight=(S2, 1, 5, 5),
+            bias=(S2,),
+            stride=[1, 1],
+            padding=[2, 2],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=S2,
+        ),
+        Test(
+            self=(1, 8, 72, 96),
+            weight=(8, 1, 3, 3),
+            bias=(8,),
+            stride=[1, 1],
+            padding=[1, 1],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=8,
+        ),
+        Test(
+            self=(1, 8, 72, 96),
+            weight=(8, 1, 5, 5),
+            bias=(8,),
+            stride=[1, 1],
+            padding=[2, 2],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=8,
+        ),
+        Test(
+            self=(1, 4, 234, 234),
+            weight=(4, 1, 3, 3),
+            bias=(4,),
+            stride=[2, 1],
+            padding=[1, 1],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=4,
+        ),
+        Test(
+            self=(1, 4, 234, 234),
+            weight=(4, 1, 3, 3),
+            bias=(4,),
+            stride=[1, 2],
+            padding=[1, 1],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=4,
+        ),
+        Test(
+            self=(1, 4, 234, 234),
+            weight=(4, 1, 3, 3),
+            bias=(4,),
+            stride=[2, 2],
+            padding=[1, 1],
+            dilation=[1, 1],
+            transposed=False,
+            output_padding=[0, 0],
+            groups=4,
+        ),
+    ]
+
     test_suite = VkTestSuite(test_cases)
     test_suite.layouts = [
         "utils::kChannelsPacked",
@@ -536,7 +616,13 @@ def get_conv_inputs():
         "utils::kChannelsPacked",
     ]
     test_suite_pw.test_name_suffix = "pw"
-    return [test_suite, test_suite_pw]
+
+    test_suite_dw = VkTestSuite(test_cases_dw)
+    test_suite_dw.layouts = [
+        "utils::kChannelsPacked",
+    ]
+    test_suite_dw.test_name_suffix = "dw"
+    return [test_suite, test_suite_pw, test_suite_dw]


 @register_test_suite("aten.native_layer_norm.default")

kernels/portable/CMakeLists.txt

Lines changed: 8 additions & 1 deletion

@@ -69,8 +69,15 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
   target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options})
   target_include_directories(optimized_portable_kernels PRIVATE ${TORCH_INCLUDE_DIRS})
   target_compile_definitions(optimized_portable_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
+  gen_selected_ops(LIB_NAME "optimized_portable_ops_lib" OPS_SCHEMA_YAML "${_yaml}")
+  generate_bindings_for_kernels(
+    LIB_NAME "optimized_portable_ops_lib" FUNCTIONS_YAML "${_yaml}"
+  )
+  gen_operators_lib(
+    LIB_NAME "optimized_portable_ops_lib" KERNEL_LIBS optimized_portable_kernels DEPS executorch_core
+  )
   install(
-    TARGETS optimized_portable_kernels
+    TARGETS optimized_portable_kernels optimized_portable_ops_lib
     DESTINATION lib
   )
 endif()

kernels/portable/cpu/util/targets.bzl

Lines changed: 10 additions & 0 deletions

@@ -307,6 +307,16 @@ def define_common_targets():
         ],
     )

+    runtime.cxx_library(
+        name = "vectorized_math",
+        exported_headers = ["vectorized_math.h"],
+        visibility = ["//executorch/..."],
+        exported_deps = [
+            "//executorch/runtime/core/portable_type:portable_type",
+            "//executorch/runtime/core/exec_aten/util:scalar_type_util",
+        ],
+    )
+
     # Utility functions that can be used by operators that perform reduction
     for aten_mode in get_aten_mode_options():
         suffix = "_aten" if aten_mode else ""

kernels/portable/cpu/util/test/CMakeLists.txt

Lines changed: 6 additions & 10 deletions

@@ -4,26 +4,22 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-# @generated by test/utils/generate_gtest_cmakelists.py
-#
-# This file should be formatted with
-# ~~~
-# cmake-format -i CMakeLists.txt
-# ~~~
-# It should also be cmake-lint clean.
-#
-
 cmake_minimum_required(VERSION 3.19)

 set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..)

 include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

 set(_test_srcs broadcast_indexes_range_test.cpp broadcast_test.cpp
-               reduce_test.cpp
+               reduce_test.cpp vectorized_math_test.cpp
 )

 et_cxx_test(
   kernels_portable_cpu_util_test SOURCES ${_test_srcs} EXTRA_LIBS
   portable_kernels portable_ops_lib
 )
+
+find_package_torch_headers()
+target_include_directories(kernels_portable_cpu_util_test PRIVATE ${TORCH_INCLUDE_DIRS})
+target_compile_definitions(kernels_portable_cpu_util_test PRIVATE ET_USE_PYTORCH_HEADERS)
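The test target needs the torch headers and the ET_USE_PYTORCH_HEADERS define because vectorized_math builds on ATen's at::vec types. A minimal sketch of the kind of check such a test can make, written against only ATen's public Vectorized API; this is not the contents of the actual vectorized_math_test.cpp:

// Hedged sketch: verifies a vectorized transcendental against its scalar
// counterpart, lane by lane, when PyTorch headers are available.
#include <array>
#include <cmath>

#include <gtest/gtest.h>

#ifdef ET_USE_PYTORCH_HEADERS
#include <ATen/cpu/vec/vec.h>

TEST(VectorizedMathTest, VecExpMatchesScalarExp) {
  using Vec = at::vec::Vectorized<float>;
  std::array<float, Vec::size()> lanes{};
  Vec v(0.5f); // broadcast 0.5f into every lane
  v.exp().store(lanes.data());
  for (const float lane : lanes) {
    EXPECT_FLOAT_EQ(lane, std::exp(0.5f));
  }
}
#endif // ET_USE_PYTORCH_HEADERS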

kernels/portable/cpu/util/test/targets.bzl

Lines changed: 11 additions & 0 deletions

@@ -32,3 +32,14 @@ def define_common_targets():
             "//executorch/kernels/portable/cpu/util:reduce_util",
         ],
     )
+
+    # this test requires ET_USE_PYTORCH_HEADERS, which doesn't work in OSS Buck.
+    if not runtime.is_oss:
+        runtime.cxx_test(
+            name = "vectorized_math_test",
+            srcs = ["vectorized_math_test.cpp"],
+            deps = [
+                "//executorch/kernels/portable/cpu/util:vectorized_math",
+                "//executorch/runtime/core/portable_type/c10/c10:c10",
+            ],
+        )
