From 72fb9b721c4be46097bad3ec4bfd574e57f27e89 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 27 May 2025 12:30:31 -0700 Subject: [PATCH 1/9] Update [ghstack-poisoned] --- .ci/docker/ci_commit_pins/pytorch.txt | 2 +- install_requirements.py | 2 +- runtime/core/portable_type/c10/c10/macros/Macros.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 75a95d0522b..306a7bf0a4a 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -01f1cc44cbbfdf6307aa01b803a4ee22f9ade946 +b40585022f80385c0bbf5c0d08c172c391ed2318 diff --git a/install_requirements.py b/install_requirements.py index 2fcd65ea338..8f8ad106c5d 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -71,7 +71,7 @@ def python_is_compatible(): # # NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt # by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/ -NIGHTLY_VERSION = "dev20250422" +NIGHTLY_VERSION = "dev20250527" def install_requirements(use_pytorch_nightly): diff --git a/runtime/core/portable_type/c10/c10/macros/Macros.h b/runtime/core/portable_type/c10/c10/macros/Macros.h index 7e61ad7e26b..0947be6c0d0 100644 --- a/runtime/core/portable_type/c10/c10/macros/Macros.h +++ b/runtime/core/portable_type/c10/c10/macros/Macros.h @@ -241,7 +241,7 @@ using namespace c10::xpu; #ifdef __HIPCC__ // Unlike CUDA, HIP requires a HIP header to be included for __host__ to work. // We do this #include here so that C10_HOST_DEVICE and friends will Just Work. 
-// See https://github.com/ROCm-Developer-Tools/HIP/issues/441 +// See https://github.com/ROCm/hip/issues/441 #include #endif @@ -286,7 +286,7 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; #define C10_MIN_BLOCKS_PER_SM(threads_per_block, blocks_per_sm) \ ((((threads_per_block) * (blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) \ ? (blocks_per_sm) \ - : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block)-1) / \ + : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block) - 1) / \ (threads_per_block)))) // C10_LAUNCH_BOUNDS is analogous to __launch_bounds__ #define C10_LAUNCH_BOUNDS_0 \ From 0b369e7745149dceae83728b15716a788f705b1b Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 27 May 2025 12:40:27 -0700 Subject: [PATCH 2/9] 05-27 doesn't seem to be working, try 05-26 [ghstack-poisoned] --- .ci/docker/ci_commit_pins/pytorch.txt | 2 +- install_requirements.py | 2 +- runtime/core/portable_type/c10/c10/util/BFloat16-inl.h | 5 +---- runtime/core/portable_type/c10/c10/util/BFloat16.h | 5 +---- 4 files changed, 4 insertions(+), 10 deletions(-) diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 306a7bf0a4a..32d0140c45e 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -b40585022f80385c0bbf5c0d08c172c391ed2318 +8c16d0e4047a8ac5885baf52e8779fb3e36f2987 diff --git a/install_requirements.py b/install_requirements.py index 8f8ad106c5d..b9b6c72142f 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -71,7 +71,7 @@ def python_is_compatible(): # # NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt # by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/ -NIGHTLY_VERSION = "dev20250527" +NIGHTLY_VERSION = "dev20250526" def install_requirements(use_pytorch_nightly): diff --git a/runtime/core/portable_type/c10/c10/util/BFloat16-inl.h 
b/runtime/core/portable_type/c10/c10/util/BFloat16-inl.h index 10ab0c828d7..1ed866f78d9 100644 --- a/runtime/core/portable_type/c10/c10/util/BFloat16-inl.h +++ b/runtime/core/portable_type/c10/c10/util/BFloat16-inl.h @@ -10,14 +10,11 @@ C10_CLANG_DIAGNOSTIC_PUSH() C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") #endif -#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) #if defined(CL_SYCL_LANGUAGE_VERSION) #include // for SYCL 1.2.1 -#else +#elif defined(SYCL_LANGUAGE_VERSION) #include // for SYCL 2020 #endif -#include -#endif namespace c10 { diff --git a/runtime/core/portable_type/c10/c10/util/BFloat16.h b/runtime/core/portable_type/c10/c10/util/BFloat16.h index 93d0ec54fb0..0f7cecda46b 100644 --- a/runtime/core/portable_type/c10/c10/util/BFloat16.h +++ b/runtime/core/portable_type/c10/c10/util/BFloat16.h @@ -14,14 +14,11 @@ #include #endif -#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) #if defined(CL_SYCL_LANGUAGE_VERSION) #include // for SYCL 1.2.1 -#else +#elif defined(SYCL_LANGUAGE_VERSION) #include // for SYCL 2020 #endif -#include -#endif namespace c10 { From d57b556fe414e80c673461dc895ac5412fad3810 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 27 May 2025 12:56:45 -0700 Subject: [PATCH 3/9] Update [ghstack-poisoned] --- .ci/docker/ci_commit_pins/pytorch.txt | 2 +- install_requirements.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 32d0140c45e..b7ce7c5ce7e 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -8c16d0e4047a8ac5885baf52e8779fb3e36f2987 +53ecb8159aa28b3c015917acaa89604cfae0d2c6 diff --git a/install_requirements.py b/install_requirements.py index b9b6c72142f..31b843c83a7 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -71,7 +71,7 @@ def python_is_compatible(): # # NOTE: If you're changing, make the corresponding change in 
.ci/docker/ci_commit_pins/pytorch.txt # by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/ -NIGHTLY_VERSION = "dev20250526" +NIGHTLY_VERSION = "dev20250525" def install_requirements(use_pytorch_nightly): From 0be8b6eebf1c6eed63c9fd389160faf1f8f71b92 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 27 May 2025 21:07:21 -0700 Subject: [PATCH 4/9] update typing-extensions version in buck [ghstack-poisoned] --- pyproject.toml | 1 + third-party/TARGETS | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 70fbbea18e5..7faa2c53304 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,6 +66,7 @@ dependencies=[ "ruamel.yaml", "sympy", "tabulate", + # See also third-party/TARGETS for buck's typing-extensions version. "typing-extensions>=4.10.0", # Keep this version in sync with: ./backends/apple/coreml/scripts/install_requirements.sh "coremltools==8.3; platform_system == 'Darwin'", diff --git a/third-party/TARGETS b/third-party/TARGETS index 0ec62c1536f..c80bd9448b3 100644 --- a/third-party/TARGETS +++ b/third-party/TARGETS @@ -23,9 +23,9 @@ prebuilt_python_library_defs = { "url": "https://files.pythonhosted.org/packages/12/fc/a4d5a7554e0067677823f7265cb3ae22aed8a238560b5133b58cda252dad/PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", }, "typing-extensions": { - "out": "typing_extensions-4.2.0-py3-none-any.whl", - "sha1": "ff0849420e94f425818bff5d0f25e3cdfaba8601", - "url": "https://files.pythonhosted.org/packages/75/e1/932e06004039dd670c9d5e1df0cd606bf46e29a28e65d5bb28e894ea29c9/typing_extensions-4.2.0-py3-none-any.whl", + "out": "typing_extensions-4.13.2-py3-none-any.whl", + "sha1": "85a14b4d38ca0e528328b6b591769e1d989f12b8", + "url": "https://files.pythonhosted.org/packages/8b/54/b1ae86c0973cc6f0210b53d508ca3641fb6d0c56823f288d108bc7ab3cc8/typing_extensions-4.13.2-py3-none-any.whl", }, "wcwidth": { 
"out": "wcwidth-0.1.5-py2.py3-none-any.whl", From d9f3957a84feea51a6db177bee81b013e261121f Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 3 Jun 2025 11:00:13 -0700 Subject: [PATCH 5/9] Update [ghstack-poisoned] --- .ci/docker/ci_commit_pins/pytorch.txt | 2 +- install_requirements.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 75a95d0522b..40a61733e8d 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -01f1cc44cbbfdf6307aa01b803a4ee22f9ade946 +64247892a0ca8ed045ad0b530eb87c3dd66590ea diff --git a/install_requirements.py b/install_requirements.py index 2fcd65ea338..31ce482b317 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -71,7 +71,7 @@ def python_is_compatible(): # # NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt # by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/ -NIGHTLY_VERSION = "dev20250422" +NIGHTLY_VERSION = "dev20250602" def install_requirements(use_pytorch_nightly): From 845b14d09bfd028f9923094ec25931a47b08da39 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 3 Jun 2025 11:10:10 -0700 Subject: [PATCH 6/9] Update [ghstack-poisoned] --- install_requirements.py | 2 +- runtime/core/portable_type/c10/c10/macros/Macros.h | 4 ++-- runtime/core/portable_type/c10/c10/util/BFloat16-inl.h | 5 +---- runtime/core/portable_type/c10/c10/util/BFloat16.h | 5 +---- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/install_requirements.py b/install_requirements.py index 31ce482b317..dcbf7e160ec 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -82,7 +82,7 @@ def install_requirements(use_pytorch_nightly): # been installed on CI before this step, so pip won't reinstall them f"torch==2.8.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch", ( - 
f"torchvision==0.22.0.{NIGHTLY_VERSION}" + f"torchvision==0.23.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torchvision" ), # For testing. diff --git a/runtime/core/portable_type/c10/c10/macros/Macros.h b/runtime/core/portable_type/c10/c10/macros/Macros.h index 7e61ad7e26b..0947be6c0d0 100644 --- a/runtime/core/portable_type/c10/c10/macros/Macros.h +++ b/runtime/core/portable_type/c10/c10/macros/Macros.h @@ -241,7 +241,7 @@ using namespace c10::xpu; #ifdef __HIPCC__ // Unlike CUDA, HIP requires a HIP header to be included for __host__ to work. // We do this #include here so that C10_HOST_DEVICE and friends will Just Work. -// See https://github.com/ROCm-Developer-Tools/HIP/issues/441 +// See https://github.com/ROCm/hip/issues/441 #include #endif @@ -286,7 +286,7 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; #define C10_MIN_BLOCKS_PER_SM(threads_per_block, blocks_per_sm) \ ((((threads_per_block) * (blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) \ ? (blocks_per_sm) \ - : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block)-1) / \ + : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block) - 1) / \ (threads_per_block)))) // C10_LAUNCH_BOUNDS is analogous to __launch_bounds__ #define C10_LAUNCH_BOUNDS_0 \ diff --git a/runtime/core/portable_type/c10/c10/util/BFloat16-inl.h b/runtime/core/portable_type/c10/c10/util/BFloat16-inl.h index 10ab0c828d7..1ed866f78d9 100644 --- a/runtime/core/portable_type/c10/c10/util/BFloat16-inl.h +++ b/runtime/core/portable_type/c10/c10/util/BFloat16-inl.h @@ -10,14 +10,11 @@ C10_CLANG_DIAGNOSTIC_PUSH() C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") #endif -#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) #if defined(CL_SYCL_LANGUAGE_VERSION) #include // for SYCL 1.2.1 -#else +#elif defined(SYCL_LANGUAGE_VERSION) #include // for SYCL 2020 #endif -#include -#endif namespace c10 { diff --git a/runtime/core/portable_type/c10/c10/util/BFloat16.h b/runtime/core/portable_type/c10/c10/util/BFloat16.h index 
93d0ec54fb0..0f7cecda46b 100644 --- a/runtime/core/portable_type/c10/c10/util/BFloat16.h +++ b/runtime/core/portable_type/c10/c10/util/BFloat16.h @@ -14,14 +14,11 @@ #include #endif -#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) #if defined(CL_SYCL_LANGUAGE_VERSION) #include // for SYCL 1.2.1 -#else +#elif defined(SYCL_LANGUAGE_VERSION) #include // for SYCL 2020 #endif -#include -#endif namespace c10 { From 20407f0e6701aa54dea329b49ec03dad9e6af361 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 3 Jun 2025 11:17:28 -0700 Subject: [PATCH 7/9] Update [ghstack-poisoned] --- install_requirements.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install_requirements.py b/install_requirements.py index dcbf7e160ec..38188d08300 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -89,7 +89,7 @@ def install_requirements(use_pytorch_nightly): ] EXAMPLES_REQUIREMENTS = [ - f"torchaudio==2.6.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torchaudio", + f"torchaudio==2.8.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torchaudio", ] # Assemble the list of requirements to actually install. 
From 5f85bcad58f16b72c714ecd7cbd4d9ea79a253cb Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 3 Jun 2025 12:24:19 -0700 Subject: [PATCH 8/9] Update [ghstack-poisoned] --- .ci/docker/ci_commit_pins/pytorch.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 40a61733e8d..6982ba9c780 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -64247892a0ca8ed045ad0b530eb87c3dd66590ea +0d0058d90de410cbc998089eb5e475776d2ad55d From 3a2516a7df6046d41bd156ecd883b0caa1b31f68 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 3 Jun 2025 14:12:32 -0700 Subject: [PATCH 9/9] Update [ghstack-poisoned] --- .Package.swift/kernels_portable/dummy.swift | 0 .../kernels_portable_debug/dummy.swift | 0 .ci/docker/ci_commit_pins/pytorch.txt | 2 +- .ci/scripts/test_ios_ci.sh | 4 - .ci/scripts/test_model.sh | 42 +- .github/scripts/label_utils.py | 4 +- .github/scripts/trymerge.py | 10 +- .github/workflows/apple-perf.yml | 9 - .github/workflows/apple.yml | 9 - .github/workflows/check-labels.yml | 2 +- .github/workflows/trunk.yml | 10 +- CMakeLists.txt | 115 +++-- CMakePresets.json | 6 +- Package.swift | 94 ++-- .../delegate/ETCoreMLDefaultModelExecutor.mm | 2 +- .../runtime/delegate/ETCoreMLModelManager.mm | 4 +- backends/apple/mps/install_requirements.sh | 8 - backends/apple/mps/setup.md | 8 +- backends/arm/_passes/__init__.py | 1 + backends/arm/_passes/arm_pass_manager.py | 3 + .../arm/_passes/decompose_groupnorm_pass.py | 208 ++++++++ .../arm/_passes/decompose_layernorm_pass.py | 8 +- .../tosa_supported_operators.py | 2 + backends/arm/operators/op_conv2d.py | 12 +- backends/arm/operators/op_view.py | 8 +- backends/arm/scripts/parse_test_names.py | 1 + backends/arm/scripts/pre-push | 4 +- backends/arm/test/ops/test_group_norm.py | 145 ++++++ .../arm/test/ops/test_linalg_vector_norm.py | 6 +- backends/cadence/aot/TARGETS | 3 + 
backends/cadence/aot/memory_constraints.py | 24 +- backends/cadence/aot/ops_registrations.py | 33 ++ backends/cadence/aot/replace_ops.py | 16 +- .../cadence/aot/tests/test_memory_passes.py | 464 ++++++++++++------ .../aot/tests/test_replace_ops_passes.py | 40 +- backends/cadence/utils/facto_util.py | 54 +- backends/qualcomm/builders/op_avg_pool2d.py | 47 +- backends/qualcomm/quantizer/annotators.py | 1 + backends/qualcomm/tests/models.py | 17 +- backends/qualcomm/tests/test_qnn_delegate.py | 135 ++++- backends/qualcomm/utils/utils.py | 52 +- .../_passes/squeeze_unsqueeze_inputs.py | 8 +- backends/vulkan/runtime/VulkanBackend.cpp | 7 + backends/vulkan/runtime/gen_vulkan_spv.py | 18 +- .../vulkan/runtime/graph/ComputeGraph.cpp | 22 +- backends/vulkan/runtime/graph/ComputeGraph.h | 27 +- .../vulkan/runtime/graph/ops/DispatchNode.cpp | 26 +- .../vulkan/runtime/graph/ops/DispatchNode.h | 6 + .../runtime/graph/ops/DynamicDispatchNode.cpp | 58 ++- .../runtime/graph/ops/DynamicDispatchNode.h | 15 + .../vulkan/runtime/graph/ops/ExecuteNode.h | 2 +- .../graph/ops/glsl/buffer_to_nchw.yaml | 1 + .../graph/ops/glsl/conv2d_dw_output_tile.glsl | 28 +- .../runtime/graph/ops/glsl/conv2d_pw.glsl | 4 +- .../graph/ops/glsl/conv2d_pw_s1p0.glsl | 35 +- .../runtime/graph/ops/glsl/image_to_nchw.yaml | 1 + .../nchw_to_bitw8_image_nobitw8buffer.glsl | 8 +- .../nchw_to_bitw8_image_nobitw8buffer.yaml | 3 + .../graph/ops/glsl/nchw_to_buffer.glsl | 14 +- .../graph/ops/glsl/nchw_to_buffer.yaml | 4 + .../runtime/graph/ops/glsl/nchw_to_image.glsl | 14 +- .../runtime/graph/ops/glsl/nchw_to_image.yaml | 10 + .../runtime/graph/ops/glsl/select.glslh | 74 +++ .../graph/ops/glsl/select_batch_4d.glsl | 52 -- .../graph/ops/glsl/select_channel_3d.glsl | 50 -- .../graph/ops/glsl/select_channel_4d.glsl | 65 --- .../graph/ops/glsl/select_height_3d.glsl | 62 --- .../graph/ops/glsl/select_height_3d.yaml | 10 - .../graph/ops/glsl/select_height_4d.glsl | 64 --- .../graph/ops/glsl/select_height_4d.yaml | 10 - 
.../graph/ops/glsl/select_width_3d.glsl | 63 --- .../graph/ops/glsl/select_width_3d.yaml | 10 - .../graph/ops/glsl/select_width_4d.glsl | 67 --- .../graph/ops/glsl/select_width_4d.yaml | 10 - .../vulkan/runtime/graph/ops/glsl/slice.glslh | 53 ++ .../graph/ops/glsl/slice_packed_dim.glsl | 67 --- .../graph/ops/glsl/slice_packed_dim.yaml | 11 - .../graph/ops/glsl/slice_unpacked_dim.glsl | 68 --- .../graph/ops/glsl/slice_unpacked_dim.yaml | 10 - .../graph/ops/glsl/transfer_buffer.glsl | 58 +++ ...t_channel_3d.yaml => transfer_buffer.yaml} | 9 +- .../graph/ops/glsl/transfer_texture.glsl | 83 ++++ ..._channel_4d.yaml => transfer_texture.yaml} | 9 +- .../vulkan/runtime/graph/ops/glsl/where.glsl | 111 +++++ .../glsl/{select_batch_4d.yaml => where.yaml} | 8 +- .../vulkan/runtime/graph/ops/impl/Clone.cpp | 4 +- .../vulkan/runtime/graph/ops/impl/Common.cpp | 33 ++ .../vulkan/runtime/graph/ops/impl/Common.h | 47 ++ .../runtime/graph/ops/impl/Convolution.cpp | 5 +- .../vulkan/runtime/graph/ops/impl/Select.cpp | 193 ++++---- .../vulkan/runtime/graph/ops/impl/Slice.cpp | 230 ++++----- .../vulkan/runtime/graph/ops/impl/Staging.cpp | 33 +- .../runtime/graph/ops/impl/Transfer.cpp | 114 +++++ .../vulkan/runtime/graph/ops/impl/Transfer.h | 40 ++ .../vulkan/runtime/graph/ops/impl/Where.cpp | 126 +++++ .../graph/ops/utils/ShaderNameUtils.cpp | 1 + .../runtime/graph/ops/utils/StagingUtils.cpp | 12 +- .../runtime/graph/ops/utils/StagingUtils.h | 3 +- backends/vulkan/runtime/vk_api/Types.h | 2 +- backends/vulkan/test/op_tests/cases.py | 34 +- .../op_tests/utils/gen_correctness_base.py | 8 +- .../test/op_tests/utils/gen_correctness_vk.py | 2 + backends/vulkan/test/test_vulkan_delegate.py | 47 ++ backends/vulkan/test/utils/test_utils.cpp | 6 +- .../vulkan/test/vulkan_compute_api_test.cpp | 22 +- backends/xnnpack/CMakeLists.txt | 46 +- backends/xnnpack/README.md | 6 +- backends/xnnpack/operators/node_visitor.py | 37 +- backends/xnnpack/runtime/XNNCompiler.cpp | 43 +- 
.../xnnpack/serialization/runtime_schema.fbs | 6 +- backends/xnnpack/serialization/schema.fbs | 6 +- .../serialization/xnnpack_graph_schema.py | 10 + backends/xnnpack/third-party/cpuinfo | 2 +- codegen/api/et_cpp.py | 18 +- codegen/api/types/__init__.py | 4 +- codegen/api/types/signatures.py | 5 +- codegen/gen.py | 35 +- codegen/test/test_executorch_custom_ops.py | 2 +- codegen/test/test_executorch_gen.py | 8 +- codegen/tools/gen_oplist.py | 1 + devtools/etrecord/_etrecord.py | 35 ++ devtools/etrecord/tests/etrecord_test.py | 19 +- .../_intermediate_output_capturer.py | 39 +- .../intermediate_output_capturer_test.py | 2 - docs/source/_static/img/swiftpm_xcode2.png | Bin 60796 -> 55550 bytes .../backend-delegates-xnnpack-reference.md | 2 +- docs/source/backends-mps.md | 8 +- .../tutorial-xnnpack-delegate-lowering.md | 10 +- .../using-executorch-building-from-source.md | 20 +- docs/source/using-executorch-ios.md | 10 +- examples/arm/aot_arm_compiler.py | 10 +- .../LLaMA/LLaMA.xcodeproj/project.pbxproj | 2 +- .../LLaMA/docs/delegates/mps_README.md | 1 - .../demo-apps/react-native/rnllama/README.md | 2 +- .../ios/rnllama.xcodeproj/project.pbxproj | 2 +- examples/models/llama/README.md | 18 +- .../llama/source_transformation/quantize.py | 58 +-- examples/models/phi-3-mini/README.md | 2 +- examples/models/qwen3/README.md | 2 +- examples/qualcomm/oss_scripts/deit.py | 148 ++++++ examples/qualcomm/oss_scripts/efficientnet.py | 145 ++++++ .../oss_scripts/llama/runner/runner.cpp | 6 +- examples/xnnpack/README.md | 10 +- exir/passes/constant_prop_pass.py | 37 +- exir/tests/test_passes.py | 28 ++ export/TARGETS | 1 + export/export.py | 67 ++- export/recipe.py | 9 +- .../LlmModuleInstrumentationTest.kt | 32 +- .../org/pytorch/executorch/ModuleE2ETest.kt | 7 +- .../executorch/ModuleInstrumentationTest.kt | 8 +- .../org/pytorch/executorch/TestFileUtils.kt | 16 + .../Exported/ExecuTorch+Tensor.swift | 4 +- .../ExecuTorch/Exported/ExecuTorchTensor.h | 4 +- 
.../ExecuTorch/__tests__/TensorTest.swift | 305 +++++------- .../Benchmark.xcodeproj/project.pbxproj | 6 +- extension/llm/tokenizers | 2 +- extension/pybindings/README.md | 20 +- extension/threadpool/cpuinfo_utils.cpp | 26 + install_executorch.py | 88 +--- install_requirements.py | 2 +- kernels/aten/functions.yaml | 4 + kernels/optimized/cpu/op_linear.cpp | 134 ++++- kernels/portable/cpu/op_rand.cpp | 50 ++ kernels/portable/cpu/op_randn.cpp | 50 ++ kernels/portable/functions.yaml | 12 + kernels/test/CMakeLists.txt | 2 + kernels/test/op_linear_test.cpp | 98 +++- kernels/test/op_rand_test.cpp | 95 ++++ kernels/test/op_randn_test.cpp | 93 ++++ kernels/test/targets.bzl | 2 + pytest.ini | 2 + runtime/backend/backend_init_context.h | 8 +- .../core/portable_type/c10/c10/targets.bzl | 2 - scripts/build_apple_frameworks.sh | 9 - scripts/test_ios.sh | 4 - .../kernels/portable/op_registration_util.bzl | 16 + third-party/TARGETS | 12 - tools/cmake/Codegen.cmake | 3 +- 179 files changed, 3960 insertions(+), 1988 deletions(-) delete mode 100644 .Package.swift/kernels_portable/dummy.swift delete mode 100644 .Package.swift/kernels_portable_debug/dummy.swift delete mode 100755 backends/apple/mps/install_requirements.sh create mode 100644 backends/arm/_passes/decompose_groupnorm_pass.py create mode 100644 backends/arm/test/ops/test_group_norm.py create mode 100644 backends/vulkan/runtime/graph/ops/glsl/select.glslh delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.glsl delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.glsl delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_height_3d.yaml delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_height_4d.yaml delete mode 
100644 backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_width_3d.yaml delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_width_4d.yaml create mode 100644 backends/vulkan/runtime/graph/ops/glsl/slice.glslh delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.glsl delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.yaml delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.glsl delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.yaml create mode 100644 backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl rename backends/vulkan/runtime/graph/ops/glsl/{select_channel_3d.yaml => transfer_buffer.yaml} (54%) create mode 100644 backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl rename backends/vulkan/runtime/graph/ops/glsl/{select_channel_4d.yaml => transfer_texture.yaml} (52%) create mode 100644 backends/vulkan/runtime/graph/ops/glsl/where.glsl rename backends/vulkan/runtime/graph/ops/glsl/{select_batch_4d.yaml => where.yaml} (64%) create mode 100644 backends/vulkan/runtime/graph/ops/impl/Common.cpp create mode 100644 backends/vulkan/runtime/graph/ops/impl/Common.h create mode 100644 backends/vulkan/runtime/graph/ops/impl/Transfer.cpp create mode 100644 backends/vulkan/runtime/graph/ops/impl/Transfer.h create mode 100644 backends/vulkan/runtime/graph/ops/impl/Where.cpp create mode 100644 examples/qualcomm/oss_scripts/deit.py create mode 100644 examples/qualcomm/oss_scripts/efficientnet.py create mode 100644 extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/TestFileUtils.kt create mode 100644 kernels/portable/cpu/op_rand.cpp create mode 100644 kernels/portable/cpu/op_randn.cpp create mode 100644 kernels/test/op_rand_test.cpp create mode 100644 
kernels/test/op_randn_test.cpp diff --git a/.Package.swift/kernels_portable/dummy.swift b/.Package.swift/kernels_portable/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.Package.swift/kernels_portable_debug/dummy.swift b/.Package.swift/kernels_portable_debug/dummy.swift deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 6982ba9c780..bb8caf3ffeb 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -0d0058d90de410cbc998089eb5e475776d2ad55d +5616fa4a68718ead203314a3467f7dd9547153ae diff --git a/.ci/scripts/test_ios_ci.sh b/.ci/scripts/test_ios_ci.sh index 16f2e16de50..6908d61483c 100755 --- a/.ci/scripts/test_ios_ci.sh +++ b/.ci/scripts/test_ios_ci.sh @@ -42,10 +42,6 @@ say "Installing CoreML Backend Requirements" ./backends/apple/coreml/scripts/install_requirements.sh -say "Installing MPS Backend Requirements" - -./backends/apple/mps/install_requirements.sh - say "Exporting Models" python3 -m examples.portable.scripts.export --model_name="$MODEL_NAME" --segment_alignment=0x4000 diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index 38c45dc3fb7..aa74f3a5447 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -49,14 +49,24 @@ prepare_artifacts_upload() { } build_cmake_executor_runner() { + local backend_string_select="${1:-}" echo "Building executor_runner" rm -rf ${CMAKE_OUTPUT_DIR} - cmake -DCMAKE_BUILD_TYPE=Debug \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ - -B${CMAKE_OUTPUT_DIR} . 
- - cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug + mkdir ${CMAKE_OUTPUT_DIR} + if [[ "$backend_string_select" == "XNNPACK" ]]; then + echo "Backend $backend_string_select selected" + (cd ${CMAKE_OUTPUT_DIR} \ + && cmake -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..) + cmake --build ${CMAKE_OUTPUT_DIR} -j4 + else + cmake -DCMAKE_BUILD_TYPE=Debug \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -B${CMAKE_OUTPUT_DIR} . + cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug + fi } run_portable_executor_runner() { @@ -111,19 +121,6 @@ test_model() { run_portable_executor_runner } -build_cmake_xnn_executor_runner() { - echo "Building xnn_executor_runner" - - (rm -rf ${CMAKE_OUTPUT_DIR} \ - && mkdir ${CMAKE_OUTPUT_DIR} \ - && cd ${CMAKE_OUTPUT_DIR} \ - && retry cmake -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..) - - cmake --build ${CMAKE_OUTPUT_DIR} -j4 -} - test_model_with_xnnpack() { WITH_QUANTIZATION=$1 WITH_DELEGATION=$2 @@ -148,12 +145,11 @@ test_model_with_xnnpack() { # Run test model if [[ "${BUILD_TOOL}" == "buck2" ]]; then + # TODO eventually buck should also use consolidated executor runners buck2 run //examples/xnnpack:xnn_executor_runner -- --model_path "${OUTPUT_MODEL_PATH}" elif [[ "${BUILD_TOOL}" == "cmake" ]]; then - if [[ ! -f ${CMAKE_OUTPUT_DIR}/backends/xnnpack/xnn_executor_runner ]]; then - build_cmake_xnn_executor_runner - fi - ./${CMAKE_OUTPUT_DIR}/backends/xnnpack/xnn_executor_runner --model_path "${OUTPUT_MODEL_PATH}" + build_cmake_executor_runner "XNNPACK" + ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "${OUTPUT_MODEL_PATH}" else echo "Invalid build tool ${BUILD_TOOL}. 
Only buck2 and cmake are supported atm" exit 1 diff --git a/.github/scripts/label_utils.py b/.github/scripts/label_utils.py index 53daf222250..609316cfe2b 100644 --- a/.github/scripts/label_utils.py +++ b/.github/scripts/label_utils.py @@ -22,9 +22,7 @@ LABEL_ERR_MSG_TITLE = "This PR needs a `release notes:` label" LABEL_ERR_MSG = f"""# {LABEL_ERR_MSG_TITLE} -If your change should be included in the release notes (i.e. would users of this library care about this change?), please use a label starting with `release notes:`. - -If not, please add the `release notes: none` label. +If your change should be included in the release notes (i.e. would users of this library care about this change?), please use a label starting with `release notes:`. This helps us keep track and include your important work in the next release notes. To add a label, you can comment to pytorchbot, for example `@pytorchbot label "release notes: none"` diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index 5a45089508a..124fc4ecbad 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -59,12 +59,7 @@ patterns_to_regex, retries_decorator, ) -from label_utils import ( - gh_add_labels, - gh_remove_label, - has_required_labels, - LABEL_ERR_MSG, -) +from label_utils import gh_add_labels, gh_remove_label from trymerge_explainer import get_revert_message, TryMergeExplainer # labels @@ -2116,9 +2111,6 @@ def merge( # Check for approvals find_matching_merge_rule(pr, repo, skip_mandatory_checks=True) - if not has_required_labels(pr): - raise RuntimeError(LABEL_ERR_MSG.lstrip(" #")) - if ignore_current: checks = pr.get_checkrun_conclusions() _, failing, _ = categorize_checks( diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index ed8e21a8fb4..846dc576f43 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -188,11 +188,6 @@ jobs: backends/apple/coreml/scripts/install_requirements.sh fi - if [[ ${{ 
matrix.config }} == *"mps"* ]]; then - PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - backends/apple/mps/install_requirements.sh - fi - # Install requirements for export_llama PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh @@ -379,10 +374,6 @@ jobs: # Install CoreML Backend Requirements PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ backends/apple/coreml/scripts/install_requirements.sh - - # Install MPS Backend Requirements - PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - backends/apple/mps/install_requirements.sh echo "::endgroup::" echo "::group::Build ExecuTorch iOS frameworks" diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 9c164ff5085..dcd4a0ab2a3 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -154,7 +154,6 @@ jobs: "backend_xnnpack" "kernels_custom" "kernels_optimized" - "kernels_portable" "kernels_quantized" "threadpool" ) @@ -169,10 +168,6 @@ jobs: PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ backends/apple/coreml/scripts/install_requirements.sh - # Install MPS Backend Requirements - PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - backends/apple/mps/install_requirements.sh - # Build iOS Frameworks PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output scripts/build_apple_frameworks.sh @@ -307,10 +302,6 @@ jobs: # Install CoreML Backend Requirements PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ backends/apple/coreml/scripts/install_requirements.sh - - # Install MPS Backend Requirements - PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - backends/apple/mps/install_requirements.sh echo "::endgroup::" echo "::group::Build ExecuTorch iOS frameworks" diff --git a/.github/workflows/check-labels.yml b/.github/workflows/check-labels.yml index 19c70c820a8..65da3052155 100644 --- a/.github/workflows/check-labels.yml +++ b/.github/workflows/check-labels.yml @@ -51,4 
+51,4 @@ jobs: PR_NUM: ${{ github.event.number || github.event.inputs.pr_number }} run: | set -ex - python3 .github/scripts/check_labels.py --exit-non-zero "${PR_NUM}" + python3 .github/scripts/check_labels.py "${PR_NUM}" diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index b4ce196e8ad..cab558c9b56 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -305,7 +305,7 @@ jobs: # Install requirements ${CONDA_RUN} sh install_requirements.sh ${CONDA_RUN} sh backends/apple/coreml/scripts/install_requirements.sh - ${CONDA_RUN} python install_executorch.py --pybind coreml + ${CONDA_RUN} python install_executorch.py ${CONDA_RUN} sh examples/models/llama/install_requirements.sh # Test ANE llama @@ -414,11 +414,7 @@ jobs: # Setup executorch PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool cmake - if [[ "${MODE}" == "mps" ]]; then - # Install mps delegate - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/mps/install_requirements.sh - echo "Finishing installing mps." - elif [[ "${MODE}" == "coreml" ]]; then + if [[ "${MODE}" == "coreml" ]]; then # Install coreml delegate PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/coreml/scripts/install_requirements.sh echo "Finishing installing coreml." @@ -504,8 +500,6 @@ jobs: PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/coreml/scripts/install_requirements.sh echo "Finishing installing coreml." - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/mps/install_requirements.sh - echo "Finishing installing mps." # Build and test coreml model MODELS=(mv3 ic4 resnet50 edsr mobilebert w2l) diff --git a/CMakeLists.txt b/CMakeLists.txt index 65a1eb50a77..10d4fcd95ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,13 +39,16 @@ # ~~~ # cmake-format -i CMakeLists.txt # ~~~ -# It should also be cmake-lint clean. 
+# It should also be checked with a linter via +# ~~~ +# cmake-lint CMakeLists.txt +# ~~~ # cmake_minimum_required(VERSION 3.24) project(executorch) -# MARK: - Start EXECUTORCH_H12025_BUILD_MIGRATION -------------------------------------------------- +# MARK: - Start EXECUTORCH_H12025_BUILD_MIGRATION include(${PROJECT_SOURCE_DIR}/tools/cmake/common/preset.cmake) include(${PROJECT_SOURCE_DIR}/tools/cmake/Utils.cmake) @@ -82,24 +85,25 @@ include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/default.cmake) # Print all the configs that were called with announce_configured_options. print_configured_options() -# MARK: - End EXECUTORCH_H12025_BUILD_MIGRATION ---------------------------------------------------- +# MARK: - End EXECUTORCH_H12025_BUILD_MIGRATION set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -# Setup RPATH. -# See https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling +# Setup RPATH. See +# https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling # Use separate rpaths during build and install phases set(CMAKE_SKIP_BUILD_RPATH OFF) # Don't use the install-rpath during the build phase set(CMAKE_BUILD_WITH_INSTALL_RPATH ON) # Automatically add all linked folders that are NOT in the build directory to # the rpath (per library?) -# TODO: Doesn't work for us right now because we are not installing .so's into the -# correct locations. For example we have libcustom_ops_aot_lib.so depending on -# _portable_lib.so, which was eventually put under /executorch/extension/pybindings/ -# but this rpath is not automatically added because at build time it seems `portable_lib` -# is being built under the same directory, so no extra rpath is being added. To -# properly fix this we need to install `portable_lib` into the correct path. +# TODO: Doesn't work for us right now because we are +# not installing .so's into the correct locations. 
For example we have +# libcustom_ops_aot_lib.so depending on _portable_lib.so, which was eventually +# put under /executorch/extension/pybindings/ but this rpath is +# not automatically added because at build time it seems `portable_lib` is being +# built under the same directory, so no extra rpath is being added. To properly +# fix this we need to install `portable_lib` into the correct path. set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON) # ------------------------------ OPTIONS ------------------------------------- # WARNING: Please don't add example specific options in this CMakeLists.txt. @@ -177,7 +181,7 @@ endif() if(NOT DEFINED FXDIV_SOURCE_DIR) set(ORIGINAL_CMAKE_POSITION_INDEPENDENT_CODE_FLAG - ${CMAKE_POSITION_INDEPENDENT_CODE} + ${CMAKE_POSITION_INDEPENDENT_CODE} ) set(FXDIV_SOURCE_DIR "backends/xnnpack/third-party/FXdiv") add_subdirectory("${FXDIV_SOURCE_DIR}") @@ -276,7 +280,10 @@ if(NOT "${_repo_dir_name}" STREQUAL "executorch") "fix for this restriction." ) endif() -set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/runtime/core/portable_type/c10) +set(_common_include_directories + ${CMAKE_CURRENT_SOURCE_DIR}/.. + ${CMAKE_CURRENT_SOURCE_DIR}/runtime/core/portable_type/c10 +) # # The `__srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}. @@ -310,9 +317,9 @@ endif() # Detect if an Android toolchain is set. 
if(CMAKE_TOOLCHAIN_FILE MATCHES ".*android\.toolchain\.cmake$") set(CMAKE_TOOLCHAIN_ANDROID ON) -if(NOT ANDROID_PLATFORM) - set(ANDROID_PLATFORM android-30) -endif() + if(NOT ANDROID_PLATFORM) + set(ANDROID_PLATFORM android-30) + endif() else() set(CMAKE_TOOLCHAIN_ANDROID OFF) endif() @@ -334,7 +341,6 @@ if(EXECUTORCH_USE_CPP_CODE_COVERAGE) endif() endif() - # # program_schema: Generated .h files from schema/*.fbs inputs # @@ -376,7 +382,9 @@ endif() target_include_directories( executorch_core PUBLIC ${_common_include_directories} ) -target_compile_definitions(executorch_core PUBLIC C10_USING_CUSTOM_GENERATED_MACROS) +target_compile_definitions( + executorch_core PUBLIC C10_USING_CUSTOM_GENERATED_MACROS +) target_compile_options(executorch_core PUBLIC ${_common_compile_options}) if(MAX_KERNEL_NUM) target_compile_definitions( @@ -386,9 +394,7 @@ endif() if(EXECUTORCH_BUILD_PYBIND AND APPLE) # shared version - add_library( - executorch_core_shared SHARED ${_executorch_core__srcs} - ) + add_library(executorch_core_shared SHARED ${_executorch_core__srcs}) target_link_libraries(executorch_core_shared PRIVATE program_schema) if(DL_LIBRARY_EXISTS) # For dladdr() @@ -397,7 +403,9 @@ if(EXECUTORCH_BUILD_PYBIND AND APPLE) target_include_directories( executorch_core_shared PUBLIC ${_common_include_directories} ) - target_compile_definitions(executorch_core_shared PUBLIC C10_USING_CUSTOM_GENERATED_MACROS) + target_compile_definitions( + executorch_core_shared PUBLIC C10_USING_CUSTOM_GENERATED_MACROS + ) target_compile_options( executorch_core_shared PUBLIC ${_common_compile_options} ) @@ -430,9 +438,8 @@ target_link_options_shared_lib(executorch) # operators necessary for the models that will run. # if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) - # find pytorch lib here to make it available to all - # sub-directories. Find it before including portable so that - # optimized_portable_kernels can use it. + # find pytorch lib here to make it available to all sub-directories. 
Find it + # before including portable so that optimized_portable_kernels can use it. find_package_torch_headers() endif() @@ -458,19 +465,50 @@ endif() # Install `executorch` library as well as `executorch-config.cmake` under # ${CMAKE_INSTALL_PREFIX}/ -install(DIRECTORY runtime/core/ DESTINATION include/executorch/runtime/core FILES_MATCHING PATTERN "*.h") -install(DIRECTORY runtime/kernel/ DESTINATION include/executorch/runtime/kernel FILES_MATCHING PATTERN "*.h") -install(DIRECTORY runtime/platform/ DESTINATION include/executorch/runtime/platform FILES_MATCHING PATTERN "*.h") -install(DIRECTORY extension/kernel_util/ DESTINATION include/executorch/extension/kernel_util FILES_MATCHING PATTERN "*.h") -install(DIRECTORY extension/tensor/ DESTINATION include/executorch/extension/tensor FILES_MATCHING PATTERN "*.h") -install(DIRECTORY extension/threadpool/ DESTINATION include/executorch/extension/threadpool FILES_MATCHING PATTERN "*.h") +install( + DIRECTORY runtime/core/ + DESTINATION include/executorch/runtime/core + FILES_MATCHING + PATTERN "*.h" +) +install( + DIRECTORY runtime/kernel/ + DESTINATION include/executorch/runtime/kernel + FILES_MATCHING + PATTERN "*.h" +) +install( + DIRECTORY runtime/platform/ + DESTINATION include/executorch/runtime/platform + FILES_MATCHING + PATTERN "*.h" +) +install( + DIRECTORY extension/kernel_util/ + DESTINATION include/executorch/extension/kernel_util + FILES_MATCHING + PATTERN "*.h" +) +install( + DIRECTORY extension/tensor/ + DESTINATION include/executorch/extension/tensor + FILES_MATCHING + PATTERN "*.h" +) +install( + DIRECTORY extension/threadpool/ + DESTINATION include/executorch/extension/threadpool + FILES_MATCHING + PATTERN "*.h" +) install( TARGETS executorch executorch_core - DESTINATION lib INCLUDES DESTINATION ${_common_include_directories} ) -install(FILES tools/cmake/executorch-config.cmake DESTINATION lib/cmake/ExecuTorch) +install(FILES tools/cmake/executorch-config.cmake + DESTINATION lib/cmake/ExecuTorch 
+) if(EXECUTORCH_BUILD_ARM_BAREMETAL) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm) @@ -608,17 +646,14 @@ if(EXECUTORCH_BUILD_PYBIND) endif() if(EXECUTORCH_BUILD_XNNPACK) - # need to explicitly specify XNNPACK and microkernels-prod - # here otherwise uses XNNPACK and microkernel-prod symbols from libtorch_cpu + # need to explicitly specify XNNPACK and microkernels-prod here otherwise + # uses XNNPACK and microkernel-prod symbols from libtorch_cpu list(APPEND _dep_libs xnnpack_backend XNNPACK microkernels-prod) endif() # compile options for pybind - set(_pybind_compile_options - -Wno-deprecated-declarations - -fPIC - -frtti - -fexceptions + set(_pybind_compile_options -Wno-deprecated-declarations -fPIC -frtti + -fexceptions ) # util lib diff --git a/CMakePresets.json b/CMakePresets.json index 315084f59ae..9ea91fab343 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -15,7 +15,7 @@ "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/third-party/ios-cmake/ios.toolchain.cmake", "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/macos.cmake", "PLATFORM": "MAC_ARM64", - "DEPLOYMENT_TARGET": "10.15" + "DEPLOYMENT_TARGET": "12.0" }, "condition": { "lhs": "${hostSystemName}", @@ -77,7 +77,7 @@ "inherits": ["common"], "cacheVariables": { "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/pybind.cmake", - "CMAKE_OSX_DEPLOYMENT_TARGET": "10.15" + "CMAKE_OSX_DEPLOYMENT_TARGET": "12.0" }, "condition": { "type": "inList", @@ -93,7 +93,7 @@ ], "cacheVariables": { "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/llm.cmake", - "CMAKE_OSX_DEPLOYMENT_TARGET": "10.15" + "CMAKE_OSX_DEPLOYMENT_TARGET": "12.0" }, "condition": { "type": "inList", diff --git a/Package.swift b/Package.swift index 6ee8debc413..43760822c19 100644 --- a/Package.swift +++ b/Package.swift @@ -22,7 +22,27 @@ import PackageDescription let debug_suffix = "_debug" let dependencies_suffix = "_with_dependencies" -let products = [ +func deliverables(_ dict: [String: 
[String: Any]]) -> [String: [String: Any]] { + dict + .reduce(into: [String: [String: Any]]()) { result, pair in + let (key, value) = pair + result[key] = value + result[key + debug_suffix] = value + } + .reduce(into: [String: [String: Any]]()) { result, pair in + let (key, value) = pair + var newValue = value + if key.hasSuffix(debug_suffix) { + for (k, v) in value where k.hasSuffix(debug_suffix) { + let trimmed = String(k.dropLast(debug_suffix.count)) + newValue[trimmed] = v + } + } + result[key] = newValue.filter { !$0.key.hasSuffix(debug_suffix) } + } +} + +let products = deliverables([ "backend_coreml": [ "frameworks": [ "Accelerate", @@ -58,50 +78,52 @@ let products = [ "threadpool", ], ], - "kernels_portable": [:], "kernels_quantized": [:], -].reduce(into: [String: [String: Any]]()) { - $0[$1.key] = $1.value - $0[$1.key + debug_suffix] = $1.value +]) + +let targets = deliverables([ + "threadpool": [:], +]) + +let packageProducts: [Product] = products.keys.map { key -> Product in + .library(name: key, targets: ["\(key)\(dependencies_suffix)"]) +}.sorted { $0.name < $1.name } + +var packageTargets: [Target] = [] + +for (key, value) in targets { + packageTargets.append(.binaryTarget( + name: key, + path: "cmake-out/\(key).xcframework" + )) } -let targets = [ - "threadpool", -].flatMap { [$0, $0 + debug_suffix] } +for (key, value) in products { + packageTargets.append(.binaryTarget( + name: key, + path: "cmake-out/\(key).xcframework" + )) + let target: Target = .target( + name: "\(key)\(dependencies_suffix)", + dependencies: ([key] + (value["targets"] as? [String] ?? []).map { + key.hasSuffix(debug_suffix) ? $0 + debug_suffix : $0 + }).map { .target(name: $0) }, + path: ".Package.swift/\(key)", + linkerSettings: + (value["frameworks"] as? [String] ?? []).map { .linkedFramework($0) } + + (value["libraries"] as? [String] ?? 
[]).map { .linkedLibrary($0) } + ) + packageTargets.append(target) +} let package = Package( name: "executorch", platforms: [ .iOS(.v17), - .macOS(.v10_15), + .macOS(.v12), ], - products: products.keys.map { key in - .library(name: key, targets: ["\(key)\(dependencies_suffix)"]) - }.sorted { $0.name < $1.name }, - targets: targets.map { key in - .binaryTarget( - name: key, - path: "cmake-out/\(key).xcframework" - ) - } + products.flatMap { key, value -> [Target] in - [ - .binaryTarget( - name: key, - path: "cmake-out/\(key).xcframework" - ), - .target( - name: "\(key)\(dependencies_suffix)", - dependencies:([key] + - (value["targets"] as? [String] ?? []).map { - target in key.hasSuffix(debug_suffix) ? target + debug_suffix : target - }).map { .target(name: $0) }, - path: ".Package.swift/\(key)", - linkerSettings: - (value["frameworks"] as? [String] ?? []).map { .linkedFramework($0) } + - (value["libraries"] as? [String] ?? []).map { .linkedLibrary($0) } - ), - ] - } + [ + products: packageProducts, + targets: packageTargets + [ .testTarget( name: "tests", dependencies: [ diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm index 63bc60695ce..8f36087dcc6 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm @@ -27,7 +27,7 @@ - (instancetype)initWithModel:(ETCoreMLModel *)model { eventLogger:(const executorchcoreml::ModelEventLogger* _Nullable __unused)eventLogger error:(NSError * __autoreleasing *)error { if (self.ignoreOutputBackings) { - if (@available(macOS 11.0, iOS 16.0, tvOS 16.0, watchOS 9.0, *)) { + if (@available(iOS 16.0, tvOS 16.0, watchOS 9.0, *)) { predictionOptions.outputBackings = @{}; } } diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm index 
a64d977bb26..f4cfd2146ac 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm @@ -92,7 +92,7 @@ BOOL is_backed_by_same_buffer(MLMultiArray *array1, MLMultiArray *array2) { NSOrderedSet *output_names, NSError * __autoreleasing *error) { MLPredictionOptions *options = [MLPredictionOptions new]; - if (@available(macOS 11.0, iOS 16.0, tvOS 16.0, watchOS 9.0, *)) { + if (@available(iOS 16.0, tvOS 16.0, watchOS 9.0, *)) { NSMutableDictionary *output_backings = [NSMutableDictionary dictionary]; NSEnumerator *enumerator = [output_names objectEnumerator]; for (MLMultiArray *output in outputs) { @@ -687,7 +687,7 @@ - (void)addPrewarmedAsset:(ETCoreMLAsset *)asset { eventLogger:eventLogger error:&localError]; // Try without output backings. - if (@available(macOS 11.0, iOS 16.0, tvOS 16.0, watchOS 9.0, *)) { + if (@available(iOS 16.0, tvOS 16.0, watchOS 9.0, *)) { if (!modelOutputs && predictionOptions.outputBackings.count > 0) { executor.ignoreOutputBackings = YES; localError = nil; diff --git a/backends/apple/mps/install_requirements.sh b/backends/apple/mps/install_requirements.sh deleted file mode 100755 index 1bc663d9d6d..00000000000 --- a/backends/apple/mps/install_requirements.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -# -# Copyright (c) 2023 Apple Inc. All rights reserved. -# Provided subject to the LICENSE file in the top level directory. -# - -# Install required python dependencies for using the MPS Backend -pip install --force-reinstall ninja diff --git a/backends/apple/mps/setup.md b/backends/apple/mps/setup.md index b35983514db..0ecb4151e61 100644 --- a/backends/apple/mps/setup.md +++ b/backends/apple/mps/setup.md @@ -42,12 +42,6 @@ In order to be able to successfully build and run a model using the MPS backend ***Step 1.*** Please finish tutorial [Setting up ExecuTorch](https://pytorch.org/executorch/main/getting-started-setup). 
-***Step 2.*** Install dependencies needed to lower MPS delegate: - - ```bash - ./backends/apple/mps/install_requirements.sh - ``` - ## Build ### AOT (Ahead-of-time) Components @@ -97,7 +91,7 @@ I 00:00:00.122615 executorch:mps_executor_runner.mm:501] Model verified successf ### [Optional] Run the generated model directly using pybind 1. Make sure `pybind` MPS support was installed: ```bash -./install_executorch.sh --pybind mps +CMAKE_ARGS="-DEXECUTORCH_BUILD_MPS=ON" ./install_executorch.sh ``` 2. Run the `mps_example` script to trace the model and run it directly from python: ```bash diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py index 22bf7f4c013..f207d85ebd7 100644 --- a/backends/arm/_passes/__init__.py +++ b/backends/arm/_passes/__init__.py @@ -23,6 +23,7 @@ from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass # noqa from .decompose_div_pass import DecomposeDivPass # noqa from .decompose_gelu_pass import DecomposeGeluPass # noqa +from .decompose_groupnorm_pass import DecomposeGroupNormPass # noqa from .decompose_layernorm_pass import DecomposeLayerNormPass # noqa from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass # noqa from .decompose_linalg_vector_norm_pass import DecomposeLinearVectorNormPass # noqa diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py index d2ab9dcb9ef..02a4edd398b 100644 --- a/backends/arm/_passes/arm_pass_manager.py +++ b/backends/arm/_passes/arm_pass_manager.py @@ -27,6 +27,7 @@ DecomposeCosineSimilarityPass, DecomposeDivPass, DecomposeGeluPass, + DecomposeGroupNormPass, DecomposeLayerNormPass, DecomposeLeakyReLUPass, DecomposeLinearPass, @@ -141,6 +142,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul self.add_pass(ConvertMmToBmmPass()) self.add_pass(DecomposeLinearPass()) self.add_pass(DecomposeLeakyReLUPass()) + self.add_pass(DecomposeGroupNormPass()) self.add_pass(DecomposeLayerNormPass()) 
self.add_pass(DecomposeVarPass()) self.add_pass( @@ -208,6 +210,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule): self.add_pass(DecomposeScaledDotProductAttention()) self.add_pass(ReplaceScalarWithTensorArgPassTOSABI()) self.add_pass(ScalarsToAttributePass()) + self.add_pass(DecomposeGroupNormPass()) self.add_pass(DecomposeLayerNormPass()) self.add_pass(DecomposeVarPass()) self.add_pass(DecomposeMeanDimPass(graph_module, self.tosa_spec)) diff --git a/backends/arm/_passes/decompose_groupnorm_pass.py b/backends/arm/_passes/decompose_groupnorm_pass.py new file mode 100644 index 00000000000..c6cb1b05e40 --- /dev/null +++ b/backends/arm/_passes/decompose_groupnorm_pass.py @@ -0,0 +1,208 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import operator + +import torch +from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.arm_pass_utils import create_node +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import PassResult + + +def get_group_norm_decomposition(op) -> tuple: + if op == exir_ops.edge.aten.native_group_norm.default: + return ( + exir_ops.edge.aten.mean.dim, + exir_ops.edge.aten.sub.Tensor, + exir_ops.edge.aten.var.correction, + exir_ops.edge.aten.full.default, + exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.rsqrt.default, + exir_ops.edge.aten.mul.Tensor, + exir_ops.edge.aten.view_copy.default, + ) + if op == torch.ops.aten.group_norm.default: + return ( + torch.ops.aten.mean.dim, + torch.ops.aten.sub.Tensor, + torch.ops.aten.var.correction, + torch.ops.aten.full.default, + torch.ops.aten.add.Tensor, + torch.ops.aten.rsqrt.default, + torch.ops.aten.mul.Tensor, + torch.ops.aten.view_copy.default, + ) + raise RuntimeError(f"Can't get group_norm composition for op {op}") + + +class 
DecomposeGroupNormPass(ArmPass): + """ + groupnorm is defined as: ((x - E[x]) / sqrt(Var[x] + eps)) * weights + bias + Decompose groupnorm(x, weight, bias, N, C, HxW, group, eps) to a sequence of: + mean = op_mean(x, dims) # E[x] + var = op_var(x, dims) # Var[x] + numerator = op_sub(x, mean) # (x - E[x]) + add = op_add(var, eps) # Var[x] + eps + rsqrt = op_rsqrt(add) # 1 / sqrt(Var[x] + eps) + mul = op_mul(numerator, rsqrt) # ((x - E[x]) / sqrt(Var[x] + eps)) + weigths = op_mul(mul, weigths) # ((x - E[x]) / sqrt(Var[x] + eps)) * weigths + bias = op_add(weigths, bias) # ((x - E[x]) / sqrt(Var[x] + eps)) * weigths + bias + where x can viewed with shape [N, group, C//group, HxW] dims=[C//group, HxW] + + Source: https://pytorch.org/docs/stable/generated/torch.nn.GroupNorm.html + """ + + def call(self, graph_module: torch.fx.GraphModule): + modified = False + for node in graph_module.graph.nodes: + if node.op != "call_function" or node.target not in ( + exir_ops.edge.aten.native_group_norm.default, + torch.ops.aten.group_norm.default, + ): + continue + + # epsilon default value + eps = torch.finfo().eps + weights = None + bias = None + args = node.args + meta = node.meta + if isinstance(meta["val"], tuple): + shape = meta["val"][0].size() + dtype = meta["val"][0].dtype + else: + shape = meta["val"].size() + dtype = meta["val"].dtype + match len(args): + # MI profile always provides all the args: x, weight, bias, N, C, HxW, group, eps + case 8: + x, weights, bias, N, C, HxW, group, eps = args + # BI profile: affine=[True|False], eps!=1e-5 + case 5: + x, group, weights, bias, eps = args + # BI profile: affine=True, eps=1e-5 + case 4: + x, group, weights, bias = args + # BI profile: affine=False, eps=1e=5 + case 2: + x, group = args + # Unsupported args + case _: + raise ValueError( + f"Unsupported group_norm argument pattern with {len(args)} args" + ) + N = shape[0] + C = shape[1] + HxW = 1 + for dim in shape[2:]: + HxW *= dim + channels_per_group = C // group + 
grouped_shape = torch.Size([N, group, channels_per_group, HxW]) + dims = [2, 3] + epsilon_reshaped_shape = torch.Size([1] * len(grouped_shape)) + weights_reshaped_shape = torch.Size([1, group, channels_per_group, 1]) + ( + mean_op, + sub_op, + var_op, + full_op, + add_op, + rsqrt_op, + mul_op, + view_op, + ) = get_group_norm_decomposition(node.target) + with graph_module.graph.inserting_before(node): + keepdim = True + x_reshaped = create_node( + graph_module.graph, + view_op, + args=(x, grouped_shape), + from_node=node, + ) + mean = create_node( + graph_module.graph, mean_op, args=(x_reshaped, dims, keepdim) + ) + sub = create_node(graph_module.graph, sub_op, args=(x_reshaped, mean)) + var = create_node( + graph_module.graph, + var_op, + args=(x_reshaped, dims), + kwargs={"correction": 0, "keepdim": keepdim}, + from_node=node, + ) + full = create_node( + graph_module.graph, + full_op, + args=(epsilon_reshaped_shape, eps), + kwargs={"dtype": dtype}, + from_node=node, + ) + add0 = create_node( + graph_module.graph, add_op, args=(var, full), from_node=node + ) + rsqrt = create_node( + graph_module.graph, rsqrt_op, args=(add0,), from_node=node + ) + mul0 = create_node( + graph_module.graph, mul_op, args=(sub, rsqrt), from_node=node + ) + if weights is not None: + weights_reshaped = create_node( + graph_module.graph, + view_op, + args=(weights, weights_reshaped_shape), + from_node=node, + ) + mul1 = create_node( + graph_module.graph, + mul_op, + args=( + mul0, + weights_reshaped, + ), + from_node=node, + ) + else: + mul1 = mul0 + if bias is not None: + bias_reshaped_shape = weights_reshaped_shape + bias_reshaped = create_node( + graph_module.graph, + view_op, + args=(bias, bias_reshaped_shape), + from_node=node, + ) + output = create_node( + graph_module.graph, + add_op, + args=(mul1, bias_reshaped), + from_node=node, + ) + else: + output = mul1 + + output_reshaped = create_node( + graph_module.graph, + view_op, + args=(output, shape), + from_node=node, + ) + + users = 
[user for user in node.users if node != user] + node.replace_all_uses_with(output_reshaped) + for user in users: + if user.target == operator.getitem: + user.replace_all_uses_with(output_reshaped) + graph_module.graph.erase_node(node) + graph_module.graph.eliminate_dead_code() + modified = True + if modified: + graph_module.recompile() + graph_module = super().call(graph_module).graph_module + + return PassResult(graph_module, modified) diff --git a/backends/arm/_passes/decompose_layernorm_pass.py b/backends/arm/_passes/decompose_layernorm_pass.py index a92434faa7d..e6cbdfb91a0 100644 --- a/backends/arm/_passes/decompose_layernorm_pass.py +++ b/backends/arm/_passes/decompose_layernorm_pass.py @@ -1,5 +1,4 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -47,11 +46,12 @@ class DecomposeLayerNormPass(ArmPass): Decompose layernorm(x, normalized_shape, weights, bias, eps) to a sequence of: mean = op_mean(x, dims) # E[x] var = op_var(x, dims) # Var[x] - denominator = op_sub(x, mean) # (x - E[x]) + numerator = op_sub(x, mean) # (x - E[x]) add = op_add(var, eps) # Var[x] + eps rsqrt = op_rsqrt(add) # 1 / sqrt(Var[x] + eps) - mul = op_mul(denominator, rsqrt) # ((x - E[x]) / sqrt(Var[x] + eps)) * weigths - bias = op_add(mul, bias) # ((x - E[x]) / sqrt(Var[x] + eps)) * weigths + bias + mul = op_mul(numerator, rsqrt) # ((x - E[x]) / sqrt(Var[x] + eps)) + weigths = op_mul(mul, weigths) # ((x - E[x]) / sqrt(Var[x] + eps)) * weigths + bias = op_add(weigths, bias) # ((x - E[x]) / sqrt(Var[x] + eps)) * weigths + bias Source: https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html """ diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py index 766e98688d9..36ae77d26a3 100644 --- 
a/backends/arm/operator_support/tosa_supported_operators.py +++ b/backends/arm/operator_support/tosa_supported_operators.py @@ -198,6 +198,7 @@ def is_node_supported( exir_ops.edge.aten.div.Scalar, exir_ops.edge.aten._native_batch_norm_legit_no_training.default, exir_ops.edge.aten.native_layer_norm.default, + exir_ops.edge.aten.native_group_norm.default, exir_ops.edge.aten.sigmoid.default, exir_ops.edge.aten.mean.dim, exir_ops.edge.aten.mm.default, @@ -264,6 +265,7 @@ def is_node_supported( exir_ops.edge.aten.div.Tensor: None, exir_ops.edge.aten._native_batch_norm_legit_no_training.default: "BatchNorm2D with track_running_stats==True not immediately following a convolution is not supported for quantized TOSA backends.", exir_ops.edge.aten.native_layer_norm.default: None, + exir_ops.edge.aten.native_group_norm.default: None, exir_ops.edge.aten._softmax.default: None, exir_ops.edge.aten._log_softmax.default: None, exir_ops.edge.aten.var.correction: None, diff --git a/backends/arm/operators/op_conv2d.py b/backends/arm/operators/op_conv2d.py index fdbb20fbe18..a566b0fbfa7 100644 --- a/backends/arm/operators/op_conv2d.py +++ b/backends/arm/operators/op_conv2d.py @@ -6,7 +6,6 @@ # pyre-unsafe from typing import Any, List -import numpy as np import torch from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( @@ -333,21 +332,22 @@ def define_node( weight.dtype, ) shape = tosa_graph.addConst( - np.array(weight_post_shape).shape, + [len(weight_post_shape)], ts.DType.SHAPE, - np.array(weight_post_shape), + weight_post_shape, name=weight_reshaped.name + "_shape", ) - attr = ts.TosaSerializerAttribute() - attr.ReshapeAttribute() + reshape_attr = ts.TosaSerializerAttribute() + reshape_attr.ReshapeAttribute() tosa_graph.addOperator( ts.TosaOp.Op().RESHAPE, [weight.name, shape.name], [weight_reshaped.name], - attr, + reshape_attr, ) + attr = ts.TosaSerializerAttribute() tosa_op = ts.TosaOp.Op().DEPTHWISE_CONV2D weight_name = weight_reshaped.name diff 
--git a/backends/arm/operators/op_view.py b/backends/arm/operators/op_view.py index e7a062bbf22..d8ac85ec63a 100644 --- a/backends/arm/operators/op_view.py +++ b/backends/arm/operators/op_view.py @@ -74,14 +74,14 @@ def define_node( tosa_graph = cast(ts.TosaSerializer, tosa_graph) if len(output.shape) != 0: - shape_len = len(output.shape) + shape_len = [len(output.shape)] shape_data = list(tosa_shape(output.shape, output.dim_order)) else: - shape_len = 1 - shape_data = [0] + shape_len = [] + shape_data = [] shape = tosa_graph.addConst( - [shape_len], + shape_len, ts.DType.SHAPE, shape_data, name=node.name + "_shape", diff --git a/backends/arm/scripts/parse_test_names.py b/backends/arm/scripts/parse_test_names.py index 62ff93ebc91..d4342e428b6 100644 --- a/backends/arm/scripts/parse_test_names.py +++ b/backends/arm/scripts/parse_test_names.py @@ -16,6 +16,7 @@ "adaptive_avg_pool2d.default", "bitwise_right_shift.Tensor", "bitwise_left_shift.Tensor", + "native_group_norm.default", "_native_batch_norm_legit_no_training.default", "_native_batch_norm_legit.no_stats", ] diff --git a/backends/arm/scripts/pre-push b/backends/arm/scripts/pre-push index b9a5b567a42..a4e877fdcfc 100755 --- a/backends/arm/scripts/pre-push +++ b/backends/arm/scripts/pre-push @@ -4,9 +4,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# Calling this script with any argument is equal to launching it in +# Calling this script with one argument is equal to launching it in # non-interactive mode. "$#" gives the number of positional arguments. 
-[ "$#" -eq 0 ] && is_script_interactive=1 || is_script_interactive=0 +[ "$#" -eq 1 ] && is_script_interactive=1 || is_script_interactive=0 if [ $is_script_interactive -eq 1 ]; then RESET='\e[0m' diff --git a/backends/arm/test/ops/test_group_norm.py b/backends/arm/test/ops/test_group_norm.py new file mode 100644 index 00000000000..9c5517d9dae --- /dev/null +++ b/backends/arm/test/ops/test_group_norm.py @@ -0,0 +1,145 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineBI, + EthosU85PipelineBI, + TosaPipelineBI, + TosaPipelineMI, +) + + +class GroupNorm(torch.nn.Module): + + def __init__( + self, + num_groups: int, + num_channels: int, + eps: float = 1e-5, + affine: bool = True, + ): + super().__init__() + self.group_norm = torch.nn.GroupNorm( + num_groups, + num_channels, + eps=eps, + affine=affine, + ) + + def forward( + self, + x: torch.Tensor, + ): + return self.group_norm(x) + + +input_t = tuple[torch.Tensor] +test_data_suite = { + "rand_4_6_groups_1": ((torch.rand(4, 6),), GroupNorm(1, 6)), + "rand_4_6_groups_2": ((torch.rand(4, 6),), GroupNorm(2, 6)), + "rand_4_6_groups_6": ((torch.rand(4, 6),), GroupNorm(6, 6)), + "rand_4_6_8_groups_2_eps_no_affine": ( + (torch.rand(4, 6, 8),), + GroupNorm(2, 6, eps=1e-3, affine=False), + ), + "randn_1_12_8_6_groups_6_eps": ( + (torch.randn(1, 12, 8, 6),), + GroupNorm(6, 12, eps=1e-2), + ), + "randn_1_12_8_6_groups_12": ((torch.randn(1, 12, 8, 6),), GroupNorm(12, 12)), + "rand_6_8_10_12_groups_1": ((torch.rand(6, 8, 10, 12),), GroupNorm(1, 8)), + "rand_6_8_10_12_groups_4_no_affine": ( + (torch.rand(6, 8, 10, 12),), + GroupNorm(4, 8, affine=False), + ), + "rand_6_8_10_12_groups_8": ((torch.rand(6, 8, 10, 12),), GroupNorm(8, 8)), +} + + 
+@common.parametrize("test_data", test_data_suite) +def test_native_group_norm_tosa_MI(test_data): + aten_op = "torch.ops.aten.group_norm.default" + exir_op = "executorch_exir_dialects_edge__ops_aten_native_group_norm_default" + pipeline = TosaPipelineMI[input_t]( + test_data[1], + test_data[0], + aten_op=aten_op, + exir_op=exir_op, + ) + pipeline.run() + + +@common.parametrize( + "test_data", + test_data_suite, + xfails={ + "randn_1_12_8_6_groups_12": "MLETORCH-925: Fix numerical issue for aten.native_group_norm", + "rand_6_8_10_12_groups_1": "MLETORCH-925: Fix numerical issue for aten.native_group_norm", + "rand_6_8_10_12_groups_4_no_affine": "MLETORCH-925: Fix numerical issue for aten.native_group_norm", + "rand_6_8_10_12_groups_8": "MLETORCH-925: Fix numerical issue for aten.native_group_norm", + }, + strict=False, +) +def test_native_group_norm_tosa_BI(test_data): + aten_op = "torch.ops.aten.sub.Tensor" # 'sub' op arbitrarily chosen to confirm groupnorm was decomposed + exir_op = "executorch_exir_dialects_edge__ops_aten_native_group_norm_default" + pipeline = TosaPipelineBI[input_t]( + test_data[1], + test_data[0], + aten_op=aten_op, + exir_op=exir_op, + atol=0.1, # TODO: "MLETORCH-925: Fix numerical issue for aten.native_group_norm" + ) + pipeline.run() + + +@common.parametrize( + "test_data", + test_data_suite, + xfails={ + "randn_1_12_8_6_groups_12": "MLETORCH-925: Fix numerical issue for aten.native_group_norm", + "rand_6_8_10_12_groups_1": "MLETORCH-925: Fix numerical issue for aten.native_group_norm", + "rand_6_8_10_12_groups_4_no_affine": "MLETORCH-925: Fix numerical issue for aten.native_group_norm", + "rand_6_8_10_12_groups_8": "MLETORCH-925: Fix numerical issue for aten.native_group_norm", + }, + strict=False, +) +@common.XfailIfNoCorstone300 +def test_native_group_norm_u55_BI(test_data): + pipeline = EthosU55PipelineBI[input_t]( + test_data[1], + test_data[0], + "torch.ops.aten.sub.Tensor", # 'sub' op arbitrarily chosen to confirm groupnorm was 
decomposed + run_on_fvp=True, + atol=0.1, # TODO: "MLETORCH-925: Fix numerical issue for aten.native_group_norm" + ) + pipeline.change_args("run_method_and_compare_outputs", atol=1, qtol=1) + pipeline.run() + + +@common.parametrize( + "test_data", + test_data_suite, + xfails={ + "randn_1_12_8_6_groups_12": "MLETORCH-925: Fix numerical issue for aten.native_group_norm", + "rand_6_8_10_12_groups_1": "MLETORCH-925: Fix numerical issue for aten.native_group_norm", + "rand_6_8_10_12_groups_4_no_affine": "MLETORCH-925: Fix numerical issue for aten.native_group_norm", + "rand_6_8_10_12_groups_8": "MLETORCH-925: Fix numerical issue for aten.native_group_norm", + }, + strict=False, +) +@common.XfailIfNoCorstone320 +def test_native_group_norm_u85_BI(test_data): + pipeline = EthosU85PipelineBI[input_t]( + test_data[1], + test_data[0], + "torch.ops.aten.sub.Tensor", # 'sub' op arbitrarily chosen to confirm groupnorm was decomposed + run_on_fvp=True, + atol=0.1, # TODO: "MLETORCH-925: Fix numerical issue for aten.native_group_norm" + ) + pipeline.change_args("run_method_and_compare_outputs", atol=1, qtol=1) + pipeline.run() diff --git a/backends/arm/test/ops/test_linalg_vector_norm.py b/backends/arm/test/ops/test_linalg_vector_norm.py index 36533d786dd..27e4bef97e6 100644 --- a/backends/arm/test/ops/test_linalg_vector_norm.py +++ b/backends/arm/test/ops/test_linalg_vector_norm.py @@ -72,7 +72,6 @@ def test_vector_norm_tosa_MI(test_module): pipeline = TosaPipelineMI[input_t](model, input_tensor, aten_op, exir_op) - pipeline.change_args("run_method_and_compare_outputs", qtol=1, atol=1e-4, rtol=1e-4) pipeline.run() @@ -90,7 +89,6 @@ def test_vector_norm_tosa_BI(test_module): exir_op, symmetric_io_quantization=True, ) - pipeline.change_args("run_method_and_compare_outputs", qtol=1, atol=1, rtol=1) pipeline.run() @@ -107,13 +105,12 @@ def test_vector_norm_u55_BI_fvp(test_module): run_on_fvp=True, symmetric_io_quantization=True, ) - 
pipeline.change_args("run_method_and_compare_outputs", qtol=1, atol=1, rtol=1) pipeline.pop_stage("check_not.exir") pipeline.run() @common.parametrize("test_module", test_modules) -@common.XfailIfNoCorstone300 +@common.XfailIfNoCorstone320 def test_vector_norm_u85_BI_fvp(test_module): model, input_tensor = test_module @@ -126,6 +123,5 @@ def test_vector_norm_u85_BI_fvp(test_module): run_on_fvp=True, symmetric_io_quantization=True, ) - pipeline.change_args("run_method_and_compare_outputs", qtol=1, atol=1, rtol=1) pipeline.pop_stage("check_not.exir") pipeline.run() diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index 6b2b61729ed..1613cfb28ca 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -433,6 +433,7 @@ python_unittest( srcs = [ "tests/test_memory_passes.py", ], + supports_static_listing = False, typing = True, deps = [ ":compiler", @@ -441,7 +442,9 @@ python_unittest( ":pass_utils", "//caffe2:torch", "//executorch/exir:memory", + "fbsource//third-party/pypi/parameterized:parameterized", "//executorch/exir/dialects:lib", + "//executorch/backends/cadence/aot:graph_builder", "//executorch/exir/tests:models", ], ) diff --git a/backends/cadence/aot/memory_constraints.py b/backends/cadence/aot/memory_constraints.py index 3de140e4647..377e6fc81e6 100644 --- a/backends/cadence/aot/memory_constraints.py +++ b/backends/cadence/aot/memory_constraints.py @@ -350,14 +350,28 @@ def is_slice_view(self, node: torch.fx.Node) -> bool: def is_cat_along_outermost_dim( self, graph_module: torch.fx.GraphModule, cat_node: torch.fx.Node ) -> bool: + assert len(cat_node.args) > 0 + cat_tensors = cat_node.args[0] + if not isinstance(cat_tensors, Sequence) or not all( + isinstance(t, torch.fx.Node) for t in cat_tensors + ): + raise ValueError("cat_tensors must be a sequence of torch.fx.Node objects.") + + if len(cat_node.args) > 1: + cat_dim = cat_node.args[1] + else: + cat_dim = cat_node.kwargs.get("dim", None) + if not 
isinstance(cat_dim, int): + raise ValueError("cat_dim must be an integer.") + # If the cat op has default dim, then the concat dim is 0 - if len(cat_node.args) == 1 or cat_node.args[1] == 0: + if len(cat_tensors) == 1 or cat_dim == 0: return True - # Get the concatenation dimension and concatenated tensors - (cat_tensors, cat_dim) = cast( - tuple[Sequence[torch.fx.Node], int], cat_node.args - ) + + # Make sure all dimes before cat_dim are 1. for tensor in cat_tensors: + if not isinstance(tensor, torch.fx.Node): + continue shape = get_shape(graph_module, tensor) if shape is None or not all(dim == 1 for dim in shape[0:cat_dim]): return False diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index cdaca41569f..4a6edf03c0e 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -167,6 +167,13 @@ "where_Scalar.out(Tensor condition, float self, float other, *, Tensor(a!) out) -> Tensor(a!)" ) +lib.define( + "rope(Tensor input, Tensor sin_tensor, Tensor cos_tensor, Tensor? pos) -> (Tensor out)" +) +lib.define( + "rope.out(Tensor input, Tensor sin_tensor, Tensor cos_tensor, Tensor? pos, *, Tensor(a!) 
out) -> Tensor(a!)" +) + # ------------------------------------ # # Migrated from custom_ops.yaml # # ------------------------------------ # @@ -954,3 +961,29 @@ def where_Scalar_meta( other: float, ) -> torch.Tensor: return condition.new_empty(condition.size(), dtype=torch.float32) + + +@register_fake("cadence::rope") +def rope_meta( + input: torch.Tensor, + sin_tensor: torch.Tensor, + cos_tensor: torch.Tensor, + pos: Optional[torch.Tensor], +) -> torch.Tensor: + input_shape = list(input.shape) + assert ( + len(input_shape) in (4, 5) and input_shape[0] == 1 + ), f"input shape {input_shape} must be (1, seq, h, hd) or (1, seq, h, hd / 2, 2)" + seq = input_shape[1] + h = input_shape[2] + hd = prod(input_shape) / (seq * h) + sin_shape = list(sin_tensor.shape) + cos_shape = list(cos_tensor.shape) + assert sin_shape == cos_shape, f"{sin_shape=} must be same as {cos_shape}" + assert ( + len(sin_shape) == 2 and sin_shape[-1] == hd // 2 + ), f"{sin_shape=} must be [seq, hd/2]" + assert ( + pos is None or len(pos.shape) == 1 and pos.shape[0] == seq + ), f"{pos.shape} must be [{seq}]" + return input.new_empty(input.shape, dtype=input.dtype) diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py index 358ec1d6a4b..e5a88c10a3f 100644 --- a/backends/cadence/aot/replace_ops.py +++ b/backends/cadence/aot/replace_ops.py @@ -939,8 +939,8 @@ def replace_conv_with_nhwc_conv(self, graph_module: torch.fx.GraphModule): # This pass needs to be reworked to be compatible with PT2. It is an optimization # pass anyway, so move it to opt level 2. -# TODO(matthiascremon): update and improve this pass. -@register_cadence_pass(CadencePassAttribute(opt_level=2)) +# TODO: T213724613 update and improve this pass. 
+# @register_cadence_pass(CadencePassAttribute(opt_level=2)) class ReplaceConvWithChannelLastConvPass(ExportPass): """ Replace the ATen convolution op with custom conv op with NCHW or NHWC layout @@ -2065,11 +2065,10 @@ def call_operator( return super().call_operator(op, args, kwargs, meta) -@register_cadence_pass(CadencePassAttribute(opt_level=2)) -class ReplaceGeluWithApproximateGeluPass(ExportPass): +@register_cadence_pass(CadencePassAttribute(opt_level=0)) +class ReplaceAtenApproxGeluWithApproxGeluPass(ExportPass): """ - Replace the gelu op with an approximate gelu op. The approximate gelu op - is more efficient on DSP backends. + Replace the aten gelu op with an approximate arg with an approximate gelu op. """ def call_operator( @@ -2079,6 +2078,9 @@ def call_operator( kwargs: Dict[str, Argument], meta: NodeMetadata, ) -> ProxyValue: + if "approximate" not in kwargs: + return super().call_operator(op, args, kwargs, meta) + if op not in { exir_ops.edge.aten.gelu.default, }: @@ -2414,7 +2416,7 @@ class CadenceReplaceOpsInGraph: ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass, ReplaceAtenAvgPoolWithJarvisAvgPoolPass, ReplaceWhereWithFullArgsWithWhereScalar, - ReplaceGeluWithApproximateGeluPass, + ReplaceAtenApproxGeluWithApproxGeluPass, ReplaceSplitWithSlicePass, ReplacePowWithMulPass, ] diff --git a/backends/cadence/aot/tests/test_memory_passes.py b/backends/cadence/aot/tests/test_memory_passes.py index c32809c2bff..d220007e227 100644 --- a/backends/cadence/aot/tests/test_memory_passes.py +++ b/backends/cadence/aot/tests/test_memory_passes.py @@ -14,13 +14,23 @@ import executorch.backends.cadence.aot.ops_registrations # noqa import torch from executorch.backends.cadence.aot import compiler -from executorch.backends.cadence.aot.memory_planning import find_peak_memory_usage +from executorch.backends.cadence.aot.graph_builder import GraphBuilder +from executorch.backends.cadence.aot.memory_planning import ( + CadenceMemoryPlanning, + 
find_peak_memory_usage, +) from executorch.backends.cadence.aot.pass_utils import count_node -from executorch.backends.cadence.aot.utils import MemoryConfig +from executorch.backends.cadence.aot.utils import ( + get_default_memory_config, + MemoryConfig, +) from executorch.exir import memory from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.memory_planning import collect_specs_from_nodes +from executorch.exir.passes.spec_prop_pass import SpecPropPass from executorch.exir.tests.models import MultiLayerPerceptron +from parameterized.parameterized import parameterized +from torch.fx import GraphModule class TestMemPlanningPasses(unittest.TestCase): @@ -120,24 +130,27 @@ def forward(self, x): class TestMemTransform(unittest.TestCase): def _verify_cat_nop_memory_alloc(self, node: torch.fx.Node) -> None: - spec = node.meta.get("spec", None) - self.assertIsNotNone(spec) - dim: int = cast(int, node.args[1]) if len(node.args) > 1 else 0 - outer_size = math.prod(spec.shape[:dim]) + node_spec = node.meta.get("spec", None) + self.assertIsNotNone(node_spec) + dim: int = cast(int, node.kwargs["dim"]) if "dim" in node.kwargs else 0 + outer_size = math.prod(node_spec.shape[:dim]) self.assertEqual( outer_size, 1, f"{node=} has wrong outer size: {outer_size=}, expected 1.", ) - inner_dim_elements = math.prod(spec.shape[dim + 1 :]) * spec.dtype.itemsize + inner_dim_elements = ( + math.prod(node_spec.shape[dim + 1 :]) * node_spec.dtype.itemsize + ) dim_offset = 0 for arg in cast(list[torch.fx.Node], node.args[0]): arg_spec = arg.meta.get("spec", None) - self.assertEqual(arg_spec.mem_id, spec.mem_id) + self.assertEqual(arg_spec.mem_id, node_spec.mem_id) + actual_offset = node_spec.mem_offset + dim_offset * inner_dim_elements self.assertEqual( arg_spec.mem_offset, - spec.mem_offset + dim_offset * inner_dim_elements, - f"{arg=} for node {node=} has wrong memory offset: {arg_spec.mem_offset=} {dim_offset=} for cat on {dim=}, but output has {spec.mem_offset=}", 
+ actual_offset, + f"{arg=} of node {node=} has wrong memory offset: expected {arg_spec.mem_offset=}, but got {actual_offset=} = {node_spec.mem_offset=} + {dim_offset=} * {inner_dim_elements=}", ) dim_offset += arg_spec.shape[dim] @@ -209,23 +222,45 @@ def verify_nop_memory_alloc(self, graph_module: torch.fx.GraphModule) -> None: ): self._verify_select_nop_memory_alloc(node) - def test_optimize_cat_on_placeholders(self) -> None: - class Cat(torch.nn.Module): - def forward(self, x, y): - return torch.ops.aten.cat((x, y)) - - x = torch.ones(3, 6) - y = torch.ones(2, 6) - # Optimizing cat ops is only at opt_level 2+, and requires the memory planning - # pass to run: - graph_module = ( - compiler.export_to_executorch_gen_etrecord( - Cat(), (x, y), opt_level=2, mem_algo=1 - ) - .exported_program() - .graph_module - ) - logging.info(f"graph_module: {graph_module.print_readable(print_output=False)}") + # Initializes the nodes metadata and runs the GenerateMemoryViewConstraints, + # GenerateSliceAndSelectNopConstraints, and GenerateCatNopConstraints passes. 
+ def run_memory_planning(self, original, alloc_graph_input=True) -> GraphModule: + graph_module = SpecPropPass().call(original).graph_module + return CadenceMemoryPlanning( + get_default_memory_config(), + opt_level=2, + mem_algo=1, # greedy_by_size_for_offset_calculation_with_hierarchy + alloc_graph_input=alloc_graph_input, + )(graph_module).graph_module + + @parameterized.expand( + [ + [ + [3, 6], # x_shape + [2, 6], # y_shape + 0, # concat dim + ], + ] + ) + def test_optimize_cat_on_placeholders(self, x_shape, y_shape, concat_dim) -> None: + concat_shape = [x_shape[concat_dim] + y_shape[concat_dim], x_shape[1]] + builder = GraphBuilder() + x = builder.placeholder("x", torch.ones(*x_shape)) + y = builder.placeholder("y", torch.ones(*y_shape)) + pre_created_output = builder.call_operator( + op=exir_ops.edge.aten.full.default, + args=(concat_shape, 0.0), + kwargs={"dtype": torch.float32}, + ) + graph_output = builder.call_operator( + op=torch.ops.aten.cat.out, + args=([x, y],), + kwargs={"dim": concat_dim, "out": pre_created_output}, + ) + builder.output([graph_output]) + original = builder.get_graph_module() + + graph_module = self.run_memory_planning(original) graph_module.graph.eliminate_dead_code() # Assert that cat op is optimized away self.assertEqual(count_node(graph_module, torch.ops.aten.cat.out), 0) @@ -233,53 +268,88 @@ def forward(self, x, y): self.assertEqual(count_node(graph_module, torch.ops.aten._cat_nop.out), 1) self.verify_nop_memory_alloc(graph_module) - def test_optimize_cat_outermost(self) -> None: - class OptimizeCatFeasible1(torch.nn.Module): - def forward(self, x, y): - x1 = torch.add(x, 2.4, 3.1) - y1 = torch.add(y, 1, 2) - # Cat along the outermost dimension can be optimized away after - # adding constraints on the locations of x1 and y1. 
- return torch.ops.aten.cat((x1, y1)) - - x = torch.ones(3, 6) - y = torch.ones(2, 6) - # Optimizing cat ops is only at opt_level 2+, and requires the memory planning - # pass to run: - graph_module = ( - compiler.export_to_executorch_gen_etrecord( - OptimizeCatFeasible1(), (x, y), opt_level=2, mem_algo=1 + # Returns a GraphModule with the following structure: + # "add_add_cat_model" : cat(x + 123, y + 456) + # "add_add_cat_add_model": cat(x + 123, y + 456) + 789 + def get_graph_module( + self, model_name, x_shape, y_shape, concated_shape, concat_dim + ) -> GraphModule: + builder = GraphBuilder() + x = builder.placeholder("x", torch.ones(*x_shape, dtype=torch.float32)) + y = builder.placeholder("y", torch.ones(*y_shape, dtype=torch.float32)) + to_add_to_x = builder.call_operator( + op=exir_ops.edge.aten.full.default, + args=(x_shape, 123.0), + kwargs={"dtype": torch.float32}, + ) + add_x = builder.call_operator( + op=exir_ops.edge.aten.add.Tensor, + args=(x, to_add_to_x), + ) + to_add_to_y = builder.call_operator( + op=exir_ops.edge.aten.full.default, + args=(y_shape, 456.0), + kwargs={"dtype": torch.float32}, + ) + add_y = builder.call_operator( + op=exir_ops.edge.aten.add.Tensor, + args=(y, to_add_to_y), + ) + pre_created_output = builder.call_operator( + op=exir_ops.edge.aten.full.default, + args=(concated_shape, 0.0), + kwargs={"dtype": torch.float32}, + ) + cat = builder.call_operator( + op=torch.ops.aten.cat.out, + args=([add_x, add_y],), + kwargs={"dim": concat_dim, "out": pre_created_output}, + ) + if model_name == "add_add_cat_model": + builder.output([cat]) + return builder.get_graph_module() + + if model_name == "add_add_cat_add_model": + to_add_to_cat = builder.call_operator( + op=exir_ops.edge.aten.full.default, + args=(concated_shape, 789.0), + kwargs={"dtype": torch.float32}, ) - .exported_program() - .graph_module - ) - graph_module.graph.eliminate_dead_code() - # Assert that cat op is optimized away - self.assertEqual(count_node(graph_module, 
torch.ops.aten.cat.out), 0) - # Assert that cat op is replaced by its nop version post optimization - self.assertEqual(count_node(graph_module, torch.ops.aten._cat_nop.out), 1) - self.verify_nop_memory_alloc(graph_module) - - def test_optimize_cat_non_outermost(self) -> None: - class OptimizeCatFeasible2(torch.nn.Module): - def forward(self, x, y): - x1 = torch.add(x, 2.4, 3.1) - y1 = torch.add(y, 1, 2) - # Cat along the outermost dimension can be optimized away after - # adding constraints on the locations of x1 and y1. - return torch.ops.aten.cat((x1, y1), 1) - - x = torch.ones(1, 3, 6) - y = torch.ones(1, 2, 6) - # Optimizing cat ops is only at opt_level 2+, and requires the memory planning - # pass to run: - graph_module = ( - compiler.export_to_executorch_gen_etrecord( - OptimizeCatFeasible2(), (x, y), opt_level=2, mem_algo=1 + graph_output = builder.call_operator( + op=exir_ops.edge.aten.add.Tensor, + args=(cat, to_add_to_cat), ) - .exported_program() - .graph_module - ) + builder.output([graph_output]) + return builder.get_graph_module() + + raise ValueError(f"Unknown model name {model_name}") + + @parameterized.expand( + [ + ( + "outermost", + [3, 6], # x_shape + [2, 6], # y_shape + [5, 6], # concated_shape + 0, # concat dim + ), + ( + "non_outermost", + [1, 3, 6], # x_shape + [1, 2, 6], # y_shape + [1, 5, 6], # concated_shape + 1, # concat dim + ), + ], + name_func=lambda f, _, param: f"{f.__name__}_{param.args[0]}", + ) + def test_cat_optimized( + self, _, x_shape, y_shape, concated_shape, concat_dim + ) -> None: + original = self.get_graph_module( + "add_add_cat_model", x_shape, y_shape, concated_shape, concat_dim + ) + graph_module = self.run_memory_planning(original) graph_module.graph.eliminate_dead_code() # Assert that cat op is optimized away self.assertEqual(count_node(graph_module, torch.ops.aten.cat.out), 0) @@ -287,111 +357,181 @@ def forward(self, x, y): self.assertEqual(count_node(graph_module, torch.ops.aten._cat_nop.out), 1) 
self.verify_nop_memory_alloc(graph_module) - def test_no_optimize_cat_non_outermost(self) -> None: - class OptimizeCatInfeasible1(torch.nn.Module): - def forward(self, x, y): - x1 = torch.add(x, 2.4, 3.1) - y1 = torch.add(y, 1, 2) - # Cat along the outermost dimension can be optimized away after - # adding constraints on the locations of x1 and y1. - return torch.ops.aten.cat((x1, y1), 1) - - x = torch.ones(2, 4, 5) - y = torch.ones(2, 2, 5) - # Optimizing cat ops is only at opt_level 2+, and requires the memory planning - # pass to run - graph_module = ( - compiler.export_to_executorch_gen_etrecord( - OptimizeCatInfeasible1(), (x, y), opt_level=2, mem_algo=1 - ) - .exported_program() - .graph_module - ) + @parameterized.expand( + [ + ( + "non_outermost", + [2, 4, 5], # x_shape + [2, 2, 5], # y_shape + [2, 6, 5], # concated_shape + 1, # concat dim + ), + ], + name_func=lambda f, _, param: f"{f.__name__}_{param.args[0]}", + ) + def test_cat_not_optimized( + self, _, x_shape, y_shape, concated_shape, concat_dim + ) -> None: + original = self.get_graph_module( + "add_add_cat_model", x_shape, y_shape, concated_shape, concat_dim + ) + graph_module = self.run_memory_planning(original) graph_module.graph.eliminate_dead_code() - # Assert that cat op is not optimized away, since the concat is not - # along the outermost dim + # Assert that cat op is not optimized away, since the concat is not along the outermost dim. + # The first dimension is 2, but all dims before cat_dim should be == 1. self.assertEqual(count_node(graph_module, torch.ops.aten.cat.out), 1) self.verify_nop_memory_alloc(graph_module) - def test_no_optimize_cat_non_outermost1(self) -> None: - class OptimizeCatInfeasible2(torch.nn.Module): - def forward(self, x, y): - x1 = torch.add(x, 2.4, 3.1) - y1 = torch.add(y, 1, 2) - # Cat along the outermost dimension can be optimized away after - # adding constraints on the locations of x1 and y1. 
- return torch.ops.aten.cat((x1, y1), 0) + 2 + @parameterized.expand( + [ + ( + "aligned", + [5, 8], # x_shape + [3, 8], # y_shape + [8, 8], # concated_shape + 0, # concat dim + 0, # expected cat nodes + ), + ( + "unaligned", # 5 * 5 * 4 % 8 != 0 + [5, 5], # x_shape + [3, 5], # y_shape + [8, 5], # concated_shape + 0, # concat dim + 1, # expected cat nodes + ), + ], + name_func=lambda f, _, param: f"{f.__name__}_{param.args[0]}", + ) + def test_cat_not_graph_output( + self, _, x_shape, y_shape, concated_shape, concat_dim, expected_cat_nodes + ) -> None: + original = self.get_graph_module( + "add_add_cat_add_model", x_shape, y_shape, concated_shape, concat_dim + ) + graph_module = self.run_memory_planning(original) + graph_module.graph.eliminate_dead_code() - x = torch.ones(5, 5) - y = torch.ones(3, 5) - # Optimizing cat ops is only at opt_level 2+, and requires the memory planning - # pass to run: - graph_module = ( - compiler.export_to_executorch_gen_etrecord( - OptimizeCatInfeasible2(), (x, y), opt_level=2, mem_algo=1 - ) - .exported_program() - .graph_module + # Assert that cat op is optimized away only if its arguments offsets are multiple of 8 bytes. + self.assertEqual( + count_node(graph_module, torch.ops.aten.cat.out), expected_cat_nodes ) - graph_module.graph.eliminate_dead_code() - # Assert that cat op is not optimized away, since the concat relative - # offsets are not multiple of 8 bytes, and the cat is not the output - # of the graph. - self.assertEqual(count_node(graph_module, torch.ops.aten.cat.out), 1) self.verify_nop_memory_alloc(graph_module) def test_optimize_cat_with_slice(self) -> None: - class OptimizeCatSliceFeasible(torch.nn.Module): - def forward(self, x): - x1 = torch.add(x, 2.4, 3.1) - x2 = torch.ops.aten.slice(x, 0, 0, 1) - x3 = torch.ops.aten.cat((x1, x2)) - return torch.add(x3, x3) - - x = torch.randn(5, 6) - # Compile, and set alloc_graph_input to False so that slice op is not - # optimized away. 
- # Optimizing cat ops is only at opt_level 2+, and requires the memory planning - # pass to run: - graph_module = ( - compiler.export_to_executorch_gen_etrecord( - OptimizeCatSliceFeasible(), - (x,), - opt_level=2, - mem_algo=1, - alloc_graph_input=False, - ) - .exported_program() - .graph_module - ) + x_shape = [5, 6] + concated_shape = [6, 6] + concat_dim = 0 + builder = GraphBuilder() + x = builder.placeholder("x", torch.ones(*x_shape, dtype=torch.float32)) + to_add_to_x = builder.call_operator( + op=exir_ops.edge.aten.full.default, + args=(x_shape, 123.0), + kwargs={"dtype": torch.float32}, + ) + add_x = builder.call_operator( + op=exir_ops.edge.aten.add.Tensor, + args=(x, to_add_to_x), + ) + slice_x = builder.call_operator( + op=exir_ops.edge.aten.slice.Tensor, + args=( + x, + 0, # dim + 0, # start + 1, # end + 1, # step + ), + ) + pre_created_output = builder.call_operator( + op=exir_ops.edge.aten.full.default, + args=(concated_shape, 0.0), + kwargs={"dtype": torch.float32}, + ) + cat = builder.call_operator( + op=torch.ops.aten.cat.out, + args=([add_x, slice_x],), + kwargs={"dim": concat_dim, "out": pre_created_output}, + ) + graph_output = builder.call_operator( + op=exir_ops.edge.aten.add.Tensor, + args=(cat, cat), + ) + builder.output([graph_output]) + original = builder.get_graph_module() + + graph_module = self.run_memory_planning(original, alloc_graph_input=False) graph_module.graph.eliminate_dead_code() - # Assert that cat op is optimized away + + # Assert that cat op is optimized away. + self.assertEqual(count_node(graph_module, torch.ops.aten.cat.out), 0) + # Assert that cat op is replaced by its nop version post optimization. self.assertEqual(count_node(graph_module, torch.ops.aten._cat_nop.out), 1) + # Assert that slice op was not optimized away. 
+ self.assertEqual(count_node(graph_module, exir_ops.edge.aten.slice.Tensor), 1) self.verify_nop_memory_alloc(graph_module) def test_optimize_cat_with_slice_infeasible(self) -> None: - class OptimizeCatSliceInfeasible(torch.nn.Module): - def forward(self, x, y): - x1 = torch.add(x, 2.4, 3.1) - y1 = torch.add(y, 1, 2) - y2 = torch.ops.aten.slice(y1, 0, 0, 1) - # Cat can't be optimized away if any of the tensor (e.g., y1) - # is slice_nop - return torch.ops.aten.cat((y2, x1)) - - x = torch.ones(3, 5) - y = torch.ones(2, 5) - # Optimizing cat ops is only at opt_level 2+, and requires the memory planning - # pass to run: - graph_module = ( - compiler.export_to_executorch_gen_etrecord( - OptimizeCatSliceInfeasible(), (x, y), opt_level=2, mem_algo=1 - ) - .exported_program() - .graph_module - ) + x_shape = [5, 6] + y_shape = [3, 6] + concated_shape = [8, 6] + concat_dim = 0 + builder = GraphBuilder() + x = builder.placeholder("x", torch.ones(*x_shape, dtype=torch.float32)) + y = builder.placeholder("y", torch.ones(*y_shape, dtype=torch.float32)) + to_add_to_x = builder.call_operator( + op=exir_ops.edge.aten.full.default, + args=(x_shape, 123.0), + kwargs={"dtype": torch.float32}, + ) + add_x = builder.call_operator( + op=exir_ops.edge.aten.add.Tensor, + args=(x, to_add_to_x), + ) + to_add_to_y = builder.call_operator( + op=exir_ops.edge.aten.full.default, + args=(y_shape, 123.0), + kwargs={"dtype": torch.float32}, + ) + add_y = builder.call_operator( + op=exir_ops.edge.aten.add.Tensor, + args=(y, to_add_to_y), + ) + slice_out = builder.call_operator( + op=exir_ops.edge.aten.full.default, + args=(y_shape, 0.0), + kwargs={"dtype": torch.float32}, + ) + slice_y = builder.call_operator( + op=torch.ops.aten.slice_copy.Tensor_out, + args=( + add_y, + 0, # dim + 0, # start + 1, # end + 1, # step + ), + kwargs={"out": slice_out}, + ) + pre_created_output = builder.call_operator( + op=exir_ops.edge.aten.full.default, + args=(concated_shape, 0.0), + kwargs={"dtype": 
torch.float32}, + ) + cat = builder.call_operator( + op=torch.ops.aten.cat.out, + args=([slice_y, add_x],), + kwargs={"dim": concat_dim, "out": pre_created_output}, + ) + builder.output([cat]) + original = builder.get_graph_module() + graph_module = self.run_memory_planning(original, alloc_graph_input=False) graph_module.graph.eliminate_dead_code() - # Assert that cat op is not optimized away + # # Assert that slice op is optimized away. + self.assertEqual( + count_node(graph_module, torch.ops.aten._slice_copy_nop.Tensor_out), 1 + ) + # # Assert that cat op is not optimized away self.assertEqual(count_node(graph_module, torch.ops.aten.cat.out), 1) self.verify_nop_memory_alloc(graph_module) diff --git a/backends/cadence/aot/tests/test_replace_ops_passes.py b/backends/cadence/aot/tests/test_replace_ops_passes.py index e7bf8e9cefa..e8215c378f9 100644 --- a/backends/cadence/aot/tests/test_replace_ops_passes.py +++ b/backends/cadence/aot/tests/test_replace_ops_passes.py @@ -26,13 +26,13 @@ ForceChannelLastForConvPass, MakeSliceAndCatDimOutermostPass, ReplaceAddMMWithLinearPass, + ReplaceAtenApproxGeluWithApproxGeluPass, ReplaceAtenConvolutionWithJarvisConvolutionPass, ReplaceConstantPadNdWithSlicePass, ReplaceConvolutionOptionalArgsWithConcreteArgsPass, ReplaceConvWithIm2RowAndLinear, ReplaceEmptyTensorsWithFullPass, ReplaceFunctionallyEquivalentOpTargets, - ReplaceGeluWithApproximateGeluPass, ReplaceIm2RowWithViewPass, ReplaceLinearWithFullyConnectedOpPass, ReplaceMatmulWithTransposedMatmulPass, @@ -1287,17 +1287,41 @@ def forward(self, cond: torch.Tensor): 1, ) - def test_replace_aten_gelu_with_approximate_gelu(self): - class Gelu(torch.nn.Module): - def forward(self, input): - return torch.nn.functional.gelu(input) + def test_no_replace_aten_gelu_with_approximate_gelu(self): + inputs = torch.randn(2, 1, 64) + + gm = single_op_builder( + placeholders=(inputs,), + op=exir_ops.edge.aten.gelu.default, + args=(inputs,), + ) + gm = ExportPass().call(gm).graph_module + + p 
= ReplaceAtenApproxGeluWithApproxGeluPass() + graph_after_passes = p.call(gm).graph_module + # Assert that aten.gelu op was not decomposed, since it didn't have an approximate argument + self.assertEqual( + count_node( + graph_after_passes, + exir_ops.edge.aten.gelu.default, + ), + 1, + ) + + def test_replace_aten_approximate_gelu_with_approximate_gelu(self): inputs = torch.randn(2, 1, 64) - graph_module = export_to_edge(Gelu(), (inputs,)).exported_program().graph_module + gm = single_op_builder( + placeholders=(inputs,), + op=exir_ops.edge.aten.gelu.default, + args=(inputs,), + kwargs={"approximate": "tanh"}, + ) + gm = ExportPass().call(gm).graph_module - p = ReplaceGeluWithApproximateGeluPass() - graph_after_passes = cast(PassResult, p(graph_module)).graph_module + p = ReplaceAtenApproxGeluWithApproxGeluPass() + graph_after_passes = p.call(gm).graph_module # Assert that aten.gelu op was decomposed self.assertEqual( diff --git a/backends/cadence/utils/facto_util.py b/backends/cadence/utils/facto_util.py index 8cd57059244..b896f8a8e89 100644 --- a/backends/cadence/utils/facto_util.py +++ b/backends/cadence/utils/facto_util.py @@ -20,8 +20,8 @@ MAX_CASES = 50 -def apply_tensor_contraints(op_name: str, tensor_constraints: list[object]) -> None: - additional_tensor_constraints = [ +def apply_tensor_contraints(op_name: str, index: int) -> list[object]: + tensor_constraints = [ cp.Dtype.In(lambda deps: [torch.int, torch.float]), cp.Dtype.NotIn(lambda deps: [torch.int64, torch.float64]), cp.Value.Ge(lambda deps, dtype, struct: -(2**4)), @@ -33,17 +33,28 @@ def apply_tensor_contraints(op_name: str, tensor_constraints: list[object]) -> N match op_name: case "where.self": - additional_tensor_constraints = [ - cp.Dtype.In(lambda deps: [torch.float, torch.int, torch.bool]), - cp.Dtype.NotIn(lambda deps: [torch.int64, torch.float64]), - cp.Value.Ge(lambda deps, dtype, struct: -(2**4)), - cp.Value.Le(lambda deps, dtype, struct: 2**4), - cp.Rank.Ge(lambda deps: 1), - 
cp.Size.Ge(lambda deps, r, d: 1), - cp.Size.Le(lambda deps, r, d: 2**9), - ] + if index == 0: # condition + tensor_constraints = [ + cp.Dtype.In(lambda deps: [torch.bool]), + cp.Dtype.NotIn(lambda deps: [torch.int64, torch.float64]), + cp.Value.Ge(lambda deps, dtype, struct: -(2**4)), + cp.Value.Le(lambda deps, dtype, struct: 2**4), + cp.Rank.Ge(lambda deps: 1), + cp.Size.Ge(lambda deps, r, d: 1), + cp.Size.Le(lambda deps, r, d: 2**9), + ] + else: + tensor_constraints = [ + cp.Dtype.In(lambda deps: [torch.float, torch.int]), + cp.Dtype.NotIn(lambda deps: [torch.int64, torch.float64]), + cp.Value.Ge(lambda deps, dtype, struct: -(2**4)), + cp.Value.Le(lambda deps, dtype, struct: 2**4), + cp.Rank.Ge(lambda deps: 1), + cp.Size.Ge(lambda deps, r, d: 1), + cp.Size.Le(lambda deps, r, d: 2**9), + ] case "sigmoid.default": - additional_tensor_constraints.extend( + tensor_constraints.extend( [ cp.Dtype.In(lambda deps: [torch.float]), cp.Rank.Le(lambda deps: 2**2), @@ -52,7 +63,7 @@ def apply_tensor_contraints(op_name: str, tensor_constraints: list[object]) -> N ] ) case "rsqrt.default": - additional_tensor_constraints.extend( + tensor_constraints.extend( [ cp.Dtype.In(lambda deps: [torch.float]), cp.Rank.Le(lambda deps: 2**2), @@ -63,14 +74,14 @@ def apply_tensor_contraints(op_name: str, tensor_constraints: list[object]) -> N ] ) case "mean.dim": - additional_tensor_constraints.extend( + tensor_constraints.extend( [ cp.Dtype.In(lambda deps: [torch.float]), cp.Rank.Le(lambda deps: 2**2), ] ) case "exp.default": - additional_tensor_constraints.extend( + tensor_constraints.extend( [ cp.Rank.Le(lambda deps: 2**3), cp.Value.Ge(lambda deps, dtype, struct: -(2**2)), @@ -78,7 +89,7 @@ def apply_tensor_contraints(op_name: str, tensor_constraints: list[object]) -> N ] ) case "slice_copy.Tensor": - additional_tensor_constraints.extend( + tensor_constraints.extend( [ cp.Rank.Le(lambda deps: 2), cp.Value.Ge(lambda deps, dtype, struct: 1), @@ -86,12 +97,12 @@ def 
apply_tensor_contraints(op_name: str, tensor_constraints: list[object]) -> N ] ) case _: - additional_tensor_constraints.extend( + tensor_constraints.extend( [ cp.Rank.Le(lambda deps: 2**2), ] ) - tensor_constraints.extend(additional_tensor_constraints) + return tensor_constraints def apply_scalar_contraints(op_name: str) -> list[ScalarDtype]: @@ -107,9 +118,6 @@ def apply_scalar_contraints(op_name: str) -> list[ScalarDtype]: def facto_testcase_gen(op_name: str) -> List[Tuple[List[str], OrderedDict[str, str]]]: # minimal example to test add.Tensor using FACTO spec = SpecDictDB[op_name] - tensor_constraints = [] - # common tensor constraints - apply_tensor_contraints(op_name, tensor_constraints) for index, in_spec in enumerate(copy.deepcopy(spec.inspec)): if in_spec.type.is_scalar(): @@ -142,7 +150,9 @@ def facto_testcase_gen(op_name: str) -> List[Tuple[List[str], OrderedDict[str, s ] ) elif in_spec.type.is_tensor(): - spec.inspec[index].constraints.extend(tensor_constraints) + spec.inspec[index].constraints.extend( + apply_tensor_contraints(op_name, index) + ) elif in_spec.type.is_dim_list(): spec.inspec[index].constraints.extend( [ diff --git a/backends/qualcomm/builders/op_avg_pool2d.py b/backends/qualcomm/builders/op_avg_pool2d.py index f4762e8bb5a..6892e7326f6 100644 --- a/backends/qualcomm/builders/op_avg_pool2d.py +++ b/backends/qualcomm/builders/op_avg_pool2d.py @@ -23,6 +23,12 @@ class AvgPool2d(NodeVisitor): def __init__(self, *args) -> None: super().__init__(*args) + def _get_filter_size(self, node): + filter_size = cast(List[int], node.args[1]) + if len(filter_size) == 1: + filter_size = filter_size + filter_size + return filter_size + def define_node( self, node: torch.fx.Node, @@ -46,31 +52,44 @@ def define_node( PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, nodes_to_wrappers, ) + + pt_ceil_mode = node.args[4] if len(node.args) > 4 else False + # kernel info - filter_size = cast(List[int], node.args[1]) - if len(filter_size) == 1: - filter_size 
= filter_size + filter_size + input_shape = input_node.meta["val"].shape + input_h, input_w = input_shape[2], input_shape[3] + filter_size = self._get_filter_size(node) + if pt_ceil_mode: + # filter_size might larger than input_h, input_w, use min of them + filter_size = [min(filter_size[0], input_h), min(filter_size[1], input_w)] filter_size_shape = [len(filter_size)] - # stride info - default to kernel_size if not given - stride = cast(List[int], node.args[2]) if len(node.args) > 2 else filter_size - if len(stride) == 1: - stride = stride + stride - stride_shape = [len(stride)] - padding = [0, 0] if len(node.args) > 3: padding = cast(List[int], node.args[3]) if len(padding) == 1: padding = padding + padding + if pt_ceil_mode: + ori_filter_h, ori_filter_w = self._get_filter_size(node) + padding = [ + 0 if ori_filter_h > input_h else padding[0], + 0 if ori_filter_w > input_w else padding[1], + ] + padding_shape = [len(padding), len(padding)] # if ceil mode is True, use ceil instead of floor to compute the output shape - mode = OpPoolAvg2d.RoundingMode.FLOOR - if len(node.args) > 4: - ceil_mode = cast(bool, node.args[4]) - if ceil_mode: - mode = OpPoolAvg2d.RoundingMode.CEIL + mode = ( + OpPoolAvg2d.RoundingMode.CEIL + if pt_ceil_mode + else OpPoolAvg2d.RoundingMode.FLOOR + ) + + # stride info - default to kernel_size if not given + stride = cast(List[int], node.args[2]) if len(node.args) > 2 else filter_size + if len(stride) == 1: + stride = stride + stride + stride_shape = [len(stride)] count_include_pad = True if len(node.args) > 5: diff --git a/backends/qualcomm/quantizer/annotators.py b/backends/qualcomm/quantizer/annotators.py index 730bdaf47d0..5195cf39f33 100644 --- a/backends/qualcomm/quantizer/annotators.py +++ b/backends/qualcomm/quantizer/annotators.py @@ -967,6 +967,7 @@ def annotate_cdist(node: Node, quantization_config: QuantizationConfig) -> None: @register_annotator( [ torch.ops.aten.conv2d.default, + torch.ops.aten.conv2d.padding, 
torch.ops.aten.conv1d.default, torch.ops.aten.conv_transpose2d.input, torch.ops.aten.conv_transpose1d.default, diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index 23f9e8fd79c..053bef79d1f 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -147,12 +147,13 @@ def forward(self, x, y): class AvgPoolModule(torch.nn.Module): - def __init__(self): + def __init__(self, kernel_size, stride, padding, ceil_mode): super().__init__() self.avgPool = torch.nn.AvgPool2d( - kernel_size=(2, 2), - padding=(1, 1), - stride=(1, 1), + kernel_size=kernel_size, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, count_include_pad=False, ) @@ -1268,6 +1269,14 @@ def forward(self, x): return x.repeat(1, 2, 3, 4) +class ReWriteObs(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.nn.functional.relu(x).expand(3, 4) + + class Reshape(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 3616329d32a..031b7244a64 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -49,6 +49,7 @@ generate_qnn_executorch_compiler_spec, PyQnnManagerAdaptor, QnnPartitioner, + rewrite_prepared_observer, skip_annotation, to_edge_transform_and_lower_to_qnn, update_spill_fill_size, @@ -163,9 +164,19 @@ def test_qnn_backend_argmin(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_avg_pool2d(self): - module = AvgPoolModule() # noqa: F405 - sample_input = (torch.randn(1, 3, 2, 2),) - self.lower_module_and_test_output(module, sample_input) + modules = [ + AvgPoolModule((2, 2), (1, 1), (1, 1), False), # noqa: F405 + AvgPoolModule((1280, 1280), (1280, 1280), (0, 0), True), # noqa: F405 + AvgPoolModule((1280, 1280), (1280, 1280), (320, 320), True), # noqa: F405 + ] # noqa: F405 + 
sample_inputs = [ + (torch.randn(1, 3, 2, 2),), + (torch.randn(1, 1280, 7, 7),), + (torch.randn(1, 1280, 7, 7),), + ] + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_inputs[i]) def test_qnn_backend_batch_norm(self): modules = [BatchNorm(32), BatchNorm(32, False)] # noqa: F405 @@ -1271,10 +1282,20 @@ def test_qnn_backend_argmin(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_avg_pool2d(self): - module = AvgPoolModule() # noqa: F405 - sample_input = (torch.randn(1, 3, 2, 2),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + modules = [ + AvgPoolModule((2, 2), (1, 1), (1, 1), False), # noqa: F405 + AvgPoolModule((1280, 1280), (1280, 1280), (0, 0), True), # noqa: F405 + AvgPoolModule((1280, 1280), (1280, 1280), (320, 320), True), # noqa: F405 + ] # noqa: F405 + sample_inputs = [ + (torch.randn(1, 3, 2, 2),), + (torch.randn(1, 1280, 7, 7),), + (torch.randn(1, 1280, 7, 7),), + ] + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module(module, sample_inputs[i]) + self.lower_module_and_test_output(module, sample_inputs[i]) def test_qnn_backend_batch_norm(self): modules = [BatchNorm(32), BatchNorm(32, False)] # noqa: F405 @@ -3038,6 +3059,36 @@ def test_qnn_backend_dynamic_shape(self): check_io_shape=True, ) + def test_qnn_backend_rewrite_prepared_observer(self): + from torchao.quantization.pt2e import FixedQParamsObserver + + module = ReWriteObs() # noqa: F405 + sample_input = (torch.randn([3, 1]),) + module = torch.export.export(module, sample_input, strict=True).module() + + quantizer = make_quantizer() + + prepared = prepare_pt2e(module, quantizer) + prepared(*sample_input) + + new_obs = FixedQParamsObserver( + scale=0.004, + zero_point=0, + dtype=torch.uint8, + quant_min=0, + quant_max=255, + qscheme=torch.per_tensor_affine, + ) + + 
rewrite_prepared_observer(prepared, {"activation_post_process_2": new_obs}) + self.assertTrue( + prepared.activation_post_process_1 + == prepared.activation_post_process_2 + == new_obs + ) + quantized_module = convert_pt2e(prepared) + self.lower_module_and_test_output(quantized_module, sample_input) + def test_qnn_backend_skip_node_id_partitioner(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) @@ -3829,6 +3880,41 @@ def test_conv_former(self): self.assertGreaterEqual(msg["top_1"], 60) self.assertGreaterEqual(msg["top_5"], 80) + def test_deit(self): + if not self.required_envs([self.image_dataset]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/deit.py", + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 75) + self.assertGreaterEqual(msg["top_5"], 90) + def test_dino_v2(self): if not self.required_envs([self.image_dataset]): self.skipTest("missing required envs") @@ -3864,6 +3950,41 @@ def test_dino_v2(self): self.assertGreaterEqual(msg["top_1"], 70) self.assertGreaterEqual(msg["top_5"], 85) + def test_efficientnet(self): + if not self.required_envs([self.image_dataset]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/efficientnet.py" + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + 
"--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 70) + self.assertGreaterEqual(msg["top_5"], 85) + def test_efficientSAM(self): if not self.required_envs( [self.image_dataset, self.pretrained_weight, self.oss_repo] diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 6dcad7ba00b..2d53f4dc71c 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. import operator import warnings -from collections import OrderedDict +from collections import defaultdict, OrderedDict from typing import Any, Callable, Dict, List, Optional, Tuple, Union import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManagerAdaptor @@ -1038,3 +1038,53 @@ def tag_quant_io(gm: torch.fx.GraphModule, get_quant_io_dtype_fn: Callable): for node in gm.graph.nodes: if dtype := get_quant_io_dtype_fn(node): node.meta[QCOM_QUANTIZED_IO] = dtype + + +def rewrite_prepared_observer( + graph_module: torch.fx.GraphModule, name_obs_dict: Dict[str, torch.nn.Module] +): + """ + Rewrite the observer of the specified observer module name in the graph_module. 
+ + Example: + Consider the following graph_module after prepare_pt2e: + gm = prepare_pt2e(gm) + print(gm) + + GraphModule( + (activation_post_process_0): MinMaxObserver(min_val=inf, max_val=-inf) + (activation_post_process_1): MinMaxObserver(min_val=inf, max_val=-inf) + (activation_post_process_2): MinMaxObserver(min_val=inf, max_val=-inf) + (activation_post_process_3): MinMaxObserver(min_val=inf, max_val=-inf) + ) + + new_observer = observer.FixedQParamsObserver( + scale=0.125, + zero_point=42, + dtype=torch.uint8, + quant_min=0, + quant_max=255, + qscheme=torch.per_tensor_affine, + ) + + Calling rewrite_prepared_observer(gm, {"activation_post_process_0": new_observer}) + is equivalent to: + gm.activation_post_process_0 = new_observer + + Note: + If the rewritten observer is a SharedQuantizationSpec, all other shared observers will also be rewritten. + """ + module_name_list = defaultdict(list) + for name, module in graph_module.named_modules(remove_duplicate=False): + module_name_list[module].append(name) + + for name, new_observer in name_obs_dict.items(): + old_module = getattr(graph_module, name, None) + + if not old_module: + print( + f"[WARNING], No observer named as {name} found, please check the moudle name" + ) + continue + for target_name in module_name_list[old_module]: + setattr(graph_module, target_name, new_observer) diff --git a/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py b/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py index b4337829d7f..c415249383e 100644 --- a/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py +++ b/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py @@ -32,7 +32,13 @@ def should_squeeze(self, op, shape: List[int]) -> bool: # pyre-ignore return shape[1] == 1 and shape[0] > 1 if len(shape) == 4: # No need to squeeze if all dims are 1 except the width dim - if all(dim == 1 for dim in shape[:-1]): + if shape[0] == shape[1] == shape[2] == 1: + return False + # No need to squeeze if batch and channel dims are 1 and 
height and width are > 1 + if shape[0] == shape[1] == 1 and shape[2] > 1 and shape[3] > 1: + return False + # No need to squeeze if batch dim is 1 and channel, height and width are > 1 + if shape[0] == 1 and shape[1] > 1 and shape[2] > 1 and shape[3] > 1: return False # Otherwise, check for squeezable dim return 1 in shape[:-1] diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index b32f4eb4308..02df85c33e8 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -499,6 +499,8 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface { compute_graph->encode_prepack(); compute_graph->prepack(); + // TODO(ssjia): remove this once we can batch compile compute pipelines + // during prepare(). compute_graph->encode_execute(); return Error::Ok; @@ -567,9 +569,14 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface { } } + // propagate_resize() will re-encode the command buffer so that push + // constants are updated and DynamicDispatchNode can update the compute + // shader, global workgroup size, and local workgroup size to perform the + // model inference. 
if (should_propagate_resize) { compute_graph->propagate_resize(); } + compute_graph->execute(); for (size_t i = 0; i < compute_graph->outputs().size(); i++) { diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py index 2b15b2b7d0a..5c59f13fc24 100644 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/gen_vulkan_spv.py @@ -62,6 +62,7 @@ "uint": "uimage3D", "int8": "iimage3D", "uint8": "uimage3D", + "bool": "uimage3D", }, 2: { "float": "image2D", @@ -70,6 +71,7 @@ "uint": "uimage2D", "int8": "iimage2D", "uint8": "uimage2D", + "bool": "uimage2D", }, }, "SAMPLER_T": { @@ -80,6 +82,7 @@ "uint": "usampler3D", "int8": "isampler3D", "uint8": "usampler3D", + "bool": "usampler3D", }, 2: { "float": "sampler2D", @@ -88,6 +91,7 @@ "uint": "usampler2D", "int8": "isampler2D", "uint8": "usampler2D", + "bool": "usampler2D", }, }, "IMAGE_FORMAT": { @@ -97,6 +101,7 @@ "uint": "rgba32ui", "int8": "rgba8i", "uint8": "rgba8ui", + "bool": "rgba8ui", }, } @@ -115,7 +120,8 @@ def buffer_scalar_type(dtype: str) -> str: return "float16_t" elif dtype[-1] == "8": return dtype + "_t" - + elif dtype == "bool": + return "uint8_t" return dtype @@ -135,17 +141,19 @@ def buffer_gvec_type(dtype: str, n: int) -> str: return f"i8vec{n}" elif dtype == "uint8": return f"u8vec{n}" + elif dtype == "bool": + return f"u8vec{n}" raise AssertionError(f"Invalid dtype: {dtype}") def texel_type(dtype: str) -> str: image_format = TYPE_MAPPINGS["IMAGE_FORMAT"][dtype] - if image_format[-1] == "f": + if image_format[-1:] == "f": return "vec4" - elif image_format[-2] == "ui": + elif image_format[-2:] == "ui": return "uvec4" - elif image_format[-1] == "i": + elif image_format[-1:] == "i": return "ivec4" raise AssertionError(f"Invalid image format: {image_format}") @@ -360,7 +368,7 @@ def define_required_extensions(dtypes: Union[str, List[str]]): elif dtype == "int16" or dtype == "uint16": nbit = "16bit" glsl_type = "int16" - elif dtype == "int8" or 
dtype == "uint8": + elif dtype == "int8" or dtype == "uint8" or dtype == "bool": nbit = "8bit" glsl_type = "int8" diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index a4a6abdd63f..1222a9fc641 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -492,7 +492,7 @@ vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer( const ValueRef idx) { if (values_.at(idx).isInt()) { const int32_t val = extract_scalar(idx); - create_params_buffer(val); + return create_params_buffer(val); } else if (values_.at(idx).isSymInt()) { SymIntPtr symint = get_symint(idx); return vkapi::BufferBindInfo(symint->gpu_buffer.buffer()); @@ -500,6 +500,16 @@ vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer( VK_THROW("Cannot create a int param buffer for the given value"); } +vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer( + const ValueRef idx, + const int32_t default_val) { + if (values_.at(idx).isNone()) { + return create_params_buffer(default_val); + } else { + return get_or_create_int_param_buffer(idx); + } +} + void ComputeGraph::set_symint(const ValueRef idx, const int32_t val) { get_symint(idx)->set(val); } @@ -678,11 +688,12 @@ void ComputeGraph::encode_execute() { } } -void ComputeGraph::execute() const { +void ComputeGraph::execute() { vkapi::VulkanFence fence = context_->fences().get_fence(); context_->submit_cmd_to_gpu(fence.get_submit_handle()); fence.wait(); context_->fences().return_fence(fence); + execute_count_++; } void ComputeGraph::resize_input( @@ -692,10 +703,17 @@ void ComputeGraph::resize_input( get_tensor(io_val.value)->virtual_resize(new_sizes); } +void ComputeGraph::virtual_resize( + const ValueRef idx, + const std::vector& new_sizes) { + get_tensor(idx)->virtual_resize(new_sizes); +} + void ComputeGraph::propagate_resize() { for (std::unique_ptr& node : execute_nodes_) { node->trigger_resize(this); 
} + encode_execute(); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 9f4bab3ac04..fe546f26477 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -187,6 +187,7 @@ class ComputeGraph final { protected: size_t values_in_use_ = 0; + size_t execute_count_ = 0; public: // @@ -397,6 +398,19 @@ class ComputeGraph final { std::optional extract_optional_scalar(const ValueRef idx) { if (val_is_none(idx)) { return ::std::nullopt; + } else if (val_is_symint(idx)) { + return utils::safe_downcast(read_symint(idx)); + } else { + return extract_scalar(idx); + } + } + + template + T extract_optional_scalar(const ValueRef idx, const T default_val) { + if (val_is_none(idx)) { + return default_val; + } else if (val_is_symint(idx)) { + return utils::safe_downcast(read_symint(idx)); } else { return extract_scalar(idx); } @@ -608,6 +622,10 @@ class ComputeGraph final { */ vkapi::BufferBindInfo get_or_create_int_param_buffer(const ValueRef idx); + vkapi::BufferBindInfo get_or_create_int_param_buffer( + const ValueRef idx, + const int32_t default_value); + void set_symint(const ValueRef idx, const int32_t val); int32_t read_symint(const ValueRef idx); @@ -745,13 +763,16 @@ class ComputeGraph final { // void encode_execute(); - void execute() const; + void execute(); // // Dynamic Shape support // void resize_input(const int64_t idx, const std::vector& new_sizes); + void virtual_resize( + const ValueRef idx, + const std::vector& new_sizes); void propagate_resize(); // @@ -762,6 +783,10 @@ class ComputeGraph final { return context_->adapter_ptr()->supports_int16_shader_types(); } + inline size_t execute_count() const { + return execute_count_; + } + /* * Check whether the GPU supports 8 bit buffers. 
*/ diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp b/backends/vulkan/runtime/graph/ops/DispatchNode.cpp index 51ff0c122b0..a0d3a4c2e5c 100644 --- a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp +++ b/backends/vulkan/runtime/graph/ops/DispatchNode.cpp @@ -46,15 +46,7 @@ void DispatchNode::encode(ComputeGraph* graph) { std::unique_lock cmd_lock = context->dispatch_lock(); - std::array push_constants_data; - uint32_t push_constants_offset = 0; - - for (const auto& push_constant : push_constants_) { - push_constants_offset += push_constant.write( - push_constants_data.data(), - push_constants_offset, - kMaxPushConstantSize); - } + write_push_constant_data(); context->report_shader_dispatch_start( shader_.kernel_name, @@ -63,7 +55,7 @@ void DispatchNode::encode(ComputeGraph* graph) { node_id_); vkapi::DescriptorSet descriptor_set = context->get_descriptor_set( - shader_, local_workgroup_size_, spec_vars_, push_constants_offset); + shader_, local_workgroup_size_, spec_vars_, push_constants_offset_); uint32_t idx = 0; idx = bind_values_to_descriptor_set( @@ -76,10 +68,20 @@ void DispatchNode::encode(ComputeGraph* graph) { pipeline_barrier, shader_, global_workgroup_size_, - push_constants_data.data(), - push_constants_offset); + push_constants_data_.data(), + push_constants_offset_); context->report_shader_dispatch_end(); } +void DispatchNode::write_push_constant_data() { + push_constants_offset_ = 0; + for (const auto& push_constant : push_constants_) { + push_constants_offset_ += push_constant.write( + push_constants_data_.data(), + push_constants_offset_, + kMaxPushConstantSize); + } +} + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.h b/backends/vulkan/runtime/graph/ops/DispatchNode.h index c45f0a741fd..db95adfee39 100644 --- a/backends/vulkan/runtime/graph/ops/DispatchNode.h +++ b/backends/vulkan/runtime/graph/ops/DispatchNode.h @@ -50,6 +50,12 @@ class DispatchNode : public ExecuteNode { const 
vkapi::SpecVarList spec_vars_; const std::vector push_constants_; + // For push constants + std::array push_constants_data_{}; + uint32_t push_constants_offset_ = 0; + + void write_push_constant_data(); + public: operator bool() const { return shader_; diff --git a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp b/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp index ac84916c6fa..a8d2fe2e99d 100644 --- a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp +++ b/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp @@ -25,9 +25,9 @@ DynamicDispatchNode::DynamicDispatchNode( const ResizeFunction& resize_fn) : DispatchNode( graph, - pick_shader_fn(&graph, args, resize_args), - pick_global_wg_fn(&graph, args, resize_args), - pick_local_wg_fn(&graph, args, resize_args), + vkapi::ShaderInfo(), + {1u, 1u, 1u}, + {1u, 1u, 1u}, args, params, push_constants, @@ -36,13 +36,57 @@ DynamicDispatchNode::DynamicDispatchNode( resize_fn), pick_shader_fn_(pick_shader_fn), pick_global_wg_fn_(pick_global_wg_fn), + pick_local_wg_fn_(pick_local_wg_fn) { + shader_ = pick_shader_fn(&graph, args, resize_args); + global_workgroup_size_ = + pick_global_wg_fn(&graph, shader_, args, resize_args); + local_workgroup_size_ = utils::WorkgroupSize(pick_local_wg_fn( + &graph, shader_, global_workgroup_size_, args, resize_args)); +} + +DynamicDispatchNode::DynamicDispatchNode( + ComputeGraph& graph, + const vkapi::ShaderInfo& shader, + const PickGlobalFn& pick_global_wg_fn, + const PickLocalFn& pick_local_wg_fn, + const std::vector& args, + const vkapi::ParamsBindList& params, + const std::vector& push_constants, + const vkapi::SpecVarList& spec_vars, + const std::vector& resize_args, + const ResizeFunction& resize_fn) + : DispatchNode( + graph, + shader, + pick_global_wg_fn(&graph, shader, args, resize_args), + pick_local_wg_fn( + &graph, + shader, + pick_global_wg_fn(&graph, shader, args, resize_args), + args, + resize_args), + args, + params, + push_constants, + 
spec_vars, + resize_args, + resize_fn), + pick_shader_fn_{nullptr}, + pick_global_wg_fn_(pick_global_wg_fn), pick_local_wg_fn_(pick_local_wg_fn) {} void DynamicDispatchNode::encode(ComputeGraph* graph) { - shader_ = pick_shader_fn_(graph, args_, resize_args_); - global_workgroup_size_ = pick_global_wg_fn_(graph, args_, resize_args_); - local_workgroup_size_ = - utils::WorkgroupSize(pick_local_wg_fn_(graph, args_, resize_args_)); + if (pick_shader_fn_) { + shader_ = pick_shader_fn_(graph, args_, resize_args_); + } + if (pick_global_wg_fn_) { + global_workgroup_size_ = + pick_global_wg_fn_(graph, shader_, args_, resize_args_); + } + if (pick_local_wg_fn_) { + local_workgroup_size_ = utils::WorkgroupSize(pick_local_wg_fn_( + graph, shader_, global_workgroup_size_, args_, resize_args_)); + } DispatchNode::encode(graph); } diff --git a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h b/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h index ede50941415..005151272c3 100644 --- a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h +++ b/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h @@ -32,10 +32,13 @@ class DynamicDispatchNode final : public DispatchNode { const std::vector&)>; using PickGlobalFn = const std::function&, const std::vector&)>; using PickLocalFn = const std::function&, const std::vector&)>; @@ -51,6 +54,18 @@ class DynamicDispatchNode final : public DispatchNode { const std::vector& resize_args, const ResizeFunction& resize_fn = nullptr); + explicit DynamicDispatchNode( + ComputeGraph& graph, + const vkapi::ShaderInfo& shader, + const PickGlobalFn& pick_global_wg_fn, + const PickLocalFn& pick_local_wg_fn, + const std::vector& args, + const vkapi::ParamsBindList& params, + const std::vector& push_constants, + const vkapi::SpecVarList& spec_vars, + const std::vector& resize_args, + const ResizeFunction& resize_fn = nullptr); + ~DynamicDispatchNode() override = default; void encode(ComputeGraph* graph) override; diff --git 
a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h index 7563fc63c71..0731722e13a 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h @@ -65,7 +65,7 @@ class ExecuteNode { (void)graph; } - inline void trigger_resize(ComputeGraph* graph) { + virtual inline void trigger_resize(ComputeGraph* graph) { if (resize_fn_ != nullptr) { resize_fn_(graph, args_, resize_args_); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml index 653bda9ccc0..25b3657c2eb 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml @@ -14,5 +14,6 @@ buffer_to_nchw: - VALUE: float - VALUE: int - VALUE: int8 + - VALUE: uint8 shader_variants: - NAME: buffer_to_nchw diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl index 3265a973980..0ee19206f59 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl @@ -47,11 +47,6 @@ layout(push_constant) uniform restrict Block { layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -// For performance improvement, reduce register usage by caching positions in shared memory. -// Offset index by 1 every 16 points to avoid bank access conflict. -#define offset_pos_index(index) (index + ((index) >> 4)) -shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE)]; - /* * Computes a depthwise convolution. Each shader invocation calculates the * output at a single output location. @@ -77,8 +72,6 @@ void main() { return; } - pos_shared[offset_pos_index(gl_LocalInvocationIndex)] = pos; - // Compute the index of the top-left element of the overlay region. 
Negative // indices indicate that the top-left element is in a region added by padding. const ivec2 ipos = pos.xy * stride - padding; @@ -89,13 +82,10 @@ void main() { const ivec2 end = ipos + overlay_region.xy; // sum outputs - VEC4_T sum[BATCH_SIZE_Y][BATCH_SIZE_X]; + VEC4_T sum[BATCH_SIZE_Y * BATCH_SIZE_X]; - sum[0][0] = texelFetch(t_bias, ivec2(pos.z, 0), 0); - for (int y = 0; y < BATCH_SIZE_Y; y++) { - for (int x = 0; x < BATCH_SIZE_X; x++) { - sum[y][x] = sum[0][0]; - } + for (int i = 0; i < BATCH_SIZE_Y * BATCH_SIZE_X; i++) { + sum[i] = VEC4_T(0); } // array to store input texels @@ -115,7 +105,7 @@ void main() { if (i > 0) { for (int j = 0; j < TILE_SIZE; j++) { for (int s = 0; s < BATCH_SIZE_X; s++) { - sum[1][s] = fma(in_texels[j + s], prev_kernel_line[j], sum[1][s]); + sum[BATCH_SIZE_X + s] = fma(in_texels[j + s], prev_kernel_line[j], sum[BATCH_SIZE_X + s]); } } } @@ -125,19 +115,19 @@ void main() { for (int j = 0; j < TILE_SIZE; j++, kx++) { prev_kernel_line[j] = texelFetch(t_kernel, ivec2(kx, pos.z), 0); for (int s = 0; s < BATCH_SIZE_X; s++) { - sum[0][s] = fma(in_texels[j + s], prev_kernel_line[j], sum[0][s]); + sum[s] = fma(in_texels[j + s], prev_kernel_line[j], sum[s]); } } } } - const ivec3 out_pos = pos_shared[offset_pos_index(gl_LocalInvocationIndex)]; + const VEC4_T bias = texelFetch(t_bias, ivec2(pos.z, 0), 0); for (int y = 0; y < BATCH_SIZE_Y; y++) { for (int x = 0; x < BATCH_SIZE_X; x++) { - if (any(greaterThanEqual(ivec3(out_pos.x + x, out_pos.y + y, out_pos.z), out_limits.xyz))) { - continue; + const ivec3 out_pos = ivec3(pos.x + x, pos.y + y, pos.z); + if (all(lessThan(out_pos.xy, out_limits.xy))) { + imageStore(t_out, out_pos, op(sum[y * BATCH_SIZE_X + x] + bias, out_min, out_max)); } - imageStore(t_out, ivec3(out_pos.x + x, out_pos.y + y, out_pos.z), op(sum[y][x], out_min, out_max)); } } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index 
c218b8ac8cc..cf9714ca468 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -46,7 +46,9 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; * size is only 1x1, making it easier to re-use loaded texels from t_kernel. */ void main() { - const int out_limits_scaled[2] = {out_limits.x + (TILE_SIZE_X - 1) * TILE_SIZE_X, out_limits.y + (TILE_SIZE_Y - 1) * TILE_SIZE_Y}; + const int out_limits_scaled[2] = + {(out_limits.x + (TILE_SIZE_X - 1)) / TILE_SIZE_X, + (out_limits.y + (TILE_SIZE_Y - 1)) / TILE_SIZE_Y}; const int div_by_x = int(gl_GlobalInvocationID.x / out_limits_scaled[0]); const int out_pos[3] = {int(gl_GlobalInvocationID.x % out_limits_scaled[0]), div_by_x, int(gl_GlobalInvocationID.y)}; diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl index 8ed35d84d0e..a46f1e3b99c 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl @@ -48,14 +48,17 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; * size is only 1x1, making it easier to re-use loaded texels from t_kernel. 
*/ void main() { - const int out_limits_scaled[2] = {out_limits.x + (TILE_SIZE_X - 1) * TILE_SIZE_X, out_limits.y + (TILE_SIZE_Y - 1) * TILE_SIZE_Y}; + const int out_limits_scaled[2] = + {(out_limits.x + (TILE_SIZE_X - 1)) / TILE_SIZE_X, + (out_limits.y + (TILE_SIZE_Y - 1)) / TILE_SIZE_Y}; - const int div_by_x = int(gl_GlobalInvocationID.x / out_limits_scaled[0]); - const int out_pos[3] = {int(gl_GlobalInvocationID.x % out_limits_scaled[0]), div_by_x, int(gl_GlobalInvocationID.y)}; + const uint16_t div_by_x = uint16_t(gl_GlobalInvocationID.x / out_limits_scaled[0]); + const uint16_t out_pos_xy[2] = {uint16_t(gl_GlobalInvocationID.x % out_limits_scaled[0]), div_by_x}; + const int out_pos_z = int(gl_GlobalInvocationID.y); // If the top left position is out of bounds, then this invocation will have // no work to do. - if (out_pos[1] >= out_limits_scaled[1] || out_pos[2] >= out_limits.z) { + if (out_pos_xy[1] >= out_limits_scaled[1] || out_pos_z >= out_limits.z) { return; } @@ -68,8 +71,8 @@ void main() { uint16_t pos[TILE_SIZE_X * TILE_SIZE_Y * 2]; for (uint16_t y = uint16_t(0), i = uint16_t(0); y < TILE_SIZE_Y; ++y) { for (uint16_t x = uint16_t(0); x < TILE_SIZE_X; ++x) { - pos[i * 2] = uint16_t(out_pos[0]) * TILE_SIZE_X + x; - pos[i * 2 + 1] = uint16_t(out_pos[1]) * TILE_SIZE_Y + y; + pos[i * 2] = out_pos_xy[0] * TILE_SIZE_X + x; + pos[i * 2 + 1] = out_pos_xy[1] * TILE_SIZE_Y + y; i++; } } @@ -78,14 +81,9 @@ void main() { // Tuple of consecutive 4 elements represents a single output texel. 
float sum[TILE_SIZE_X * TILE_SIZE_Y * 4]; - const vec4 bias = texelFetch(t_bias, ivec2(out_pos[2], 0), 0); - // Initialize the output array with the bias value - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y * 4; i += 4) { - sum[i] = bias.x; - sum[i + 1] = bias.y; - sum[i + 2] = bias.z; - sum[i + 3] = bias.w; + for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y * 4; i++) { + sum[i] = 0; } int z4 = 0; @@ -98,7 +96,7 @@ void main() { // Load kernel values from texels to array [[unroll]] for (int i = 0; i < 4; ++i) { - const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, out_pos[2]), 0); + const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, out_pos_z), 0); kernel_values[i * 4 + 0] = k_tex.x; kernel_values[i * 4 + 1] = k_tex.y; kernel_values[i * 4 + 2] = k_tex.z; @@ -156,10 +154,13 @@ void main() { } } + const vec4 bias = texelFetch(t_bias, ivec2(out_pos_z, 0), 0); + for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const ivec3 pos_l = ivec3(pos[i * 2], pos[i * 2 + 1], out_pos[2]); - if (all(lessThan(pos_l, out_limits.xyz))) { - imageStore(t_out, pos_l, op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max)); + const ivec3 pos_l = ivec3(pos[i * 2], pos[i * 2 + 1], out_pos_z); + if (all(lessThan(pos_l.xy, out_limits.xy))) { + const vec4 out_sum = vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]); + imageStore(t_out, pos_l, op(out_sum + bias, out_min, out_max)); } } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml index 8fc9340d9d0..c1045d93afc 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml @@ -15,6 +15,7 @@ image_to_nchw: - VALUE: float - VALUE: int - VALUE: int8 + - VALUE: uint8 shader_variants: - NAME: image_to_nchw_texture3d - NAME: image_to_nchw_texture2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl 
b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl index 327c3868847..4b18abbb1c5 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl @@ -22,7 +22,13 @@ layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_buffer(B, "r", "nchw_in", "int")} -${layout_declare_ubo(B, "ivec4", "sizes")} + +$if USE_PUSH_CONST: + layout(push_constant) uniform restrict Block { + ivec4 sizes; + }; +$else: + ${layout_declare_ubo(B, "ivec4", "sizes")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml index 506a66c0d27..0b8bbecb7bd 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml @@ -8,6 +8,7 @@ nchw_to_bitw8_image_nobitw8buffer: parameter_names_with_default_values: STORAGE: texture3d DTYPE: int8 + USE_PUSH_CONST: True generate_variant_forall: STORAGE: - VALUE: texture2d @@ -17,3 +18,5 @@ nchw_to_bitw8_image_nobitw8buffer: - VALUE: uint8 shader_variants: - NAME: nchw_to_bitw8_image_nobitw8buffer + - NAME: nchw_to_bitw8_image_nobitw8buffer_no_pc + USE_PUSH_CONST: False diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl index 32235a9ad65..ba4e4dd9dd9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl @@ -12,9 +12,17 @@ layout(std430) buffer; ${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_tensor(1, "r", "nchw_in", DTYPE, STORAGE)} -${layout_declare_ubo(2, "ivec4", "out_sizes")} 
-${layout_declare_ubo(3, "ivec4", "out_strides")} -${layout_declare_ubo(4, "int", "numel")} + +$if USE_PUSH_CONST: + layout(push_constant) uniform restrict Block { + ivec4 out_sizes; + ivec4 out_strides; + int numel; + }; +$else: + ${layout_declare_ubo(2, "ivec4", "out_sizes")} + ${layout_declare_ubo(3, "ivec4", "out_strides")} + ${layout_declare_ubo(4, "int", "numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml index 6292ef93337..486d710cf55 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml @@ -8,11 +8,15 @@ nchw_to_buffer: parameter_names_with_default_values: DTYPE: float STORAGE: buffer + USE_PUSH_CONST: True generate_variant_forall: DTYPE: - VALUE: half - VALUE: float - VALUE: int - VALUE: int8 + - VALUE: uint8 shader_variants: - NAME: nchw_to_buffer + - NAME: nchw_to_buffer_no_pc + USE_PUSH_CONST: False diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl index 2f55535c82c..4674822ce6a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl @@ -21,9 +21,17 @@ layout(std430) buffer; ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} ${layout_declare_buffer(B, "r", "buf_in", DTYPE)} -${layout_declare_ubo(B, "ivec4", "sizes")} -$if not FROM_STAGING: - ${layout_declare_ubo(B, "ivec4", "buf_strides")} + +$if USE_PUSH_CONST: + layout(push_constant) uniform restrict Block { + ivec4 sizes; + $if not FROM_STAGING: + ivec4 buf_strides; + }; +$else: + ${layout_declare_ubo(B, "ivec4", "sizes")} + $if not FROM_STAGING: + ${layout_declare_ubo(B, "ivec4", "buf_strides")} #include "indexing_utils.h" diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml 
b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml index f44e1f74bfe..7e52ec10376 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml @@ -9,15 +9,25 @@ nchw_to_image: STORAGE: texture3d DTYPE: float FROM_STAGING: True + USE_PUSH_CONST: True generate_variant_forall: DTYPE: - VALUE: half - VALUE: float - VALUE: int - VALUE: int8 + - VALUE: uint8 shader_variants: - NAME: nchw_to_image_texture3d - NAME: nchw_to_image_texture2d STORAGE: texture2d - NAME: clone_buffer_to_image FROM_STAGING: False + - NAME: nchw_to_image_no_pc_texture3d + USE_PUSH_CONST: False + - NAME: nchw_to_image_no_pc_texture2d + STORAGE: texture2d + USE_PUSH_CONST: False + - NAME: clone_buffer_to_image_no_pc + FROM_STAGING: False + USE_PUSH_CONST: False diff --git a/backends/vulkan/runtime/graph/ops/glsl/select.glslh b/backends/vulkan/runtime/graph/ops/glsl/select.glslh new file mode 100644 index 00000000000..3bcbf04a3ba --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/select.glslh @@ -0,0 +1,74 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef SELECT_GLSLH +#define SELECT_GLSLH + +/* + * Enable the fast path if a texel loaded from the input texture can be used as + * is to store to the output texture. The following conditions must be met: + * + * 1. The input and output textures have the same packed dimension. + * 2. The selected_dim must not be the packed dimension of the input. + * 3. The packed dimension of the input must "map" to the packed dimension of + * the output. This occurs if selected_dim is greater than the packed dimension + * of the input. 
+ */ +bool can_use_fast_path() { + if (out_packed_dim != in_packed_dim) { + return false; + } + if (selected_dim <= in_packed_dim) { + return false; + } + return true; +} + +/* + * Given an output tensor index, return the corresponding input tensor index for + * the select operator. This is done by "inserting" the select index at the + * selected_dim in the input tensor index. + * + * A simple example is (note all tensor index are in WHCN order): + * out_tidx = [7, 5, 9] + * selected_dim = 2 + * index = 3 + * in_tidx = [7, 3, 5, 9] + * + * This function assumes that the following variables are defined in the layout: + * - in_sizes + * - selected_dim + * - index + */ +ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { + ivec4 in_tidx = ivec4(0); + + int adjusted_index = index; + if (index < 0) { + adjusted_index = index + in_sizes[selected_dim]; + } + + // Handle different dimensions for selection + if (selected_dim == 0) { + // Select from width dimension + in_tidx = ivec4(adjusted_index, out_tidx.x, out_tidx.y, out_tidx.z); + } else if (selected_dim == 1) { + // Select from height dimension + in_tidx = ivec4(out_tidx.x, adjusted_index, out_tidx.y, out_tidx.z); + } else if (selected_dim == 2) { + // Select from channel dimension + in_tidx = ivec4(out_tidx.x, out_tidx.y, adjusted_index, out_tidx.z); + } else if (selected_dim == 3) { + // Select from batch dimension + in_tidx = ivec4(out_tidx.x, out_tidx.y, out_tidx.z, adjusted_index); + } + + return in_tidx; +} + +#endif // SELECT_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl deleted file mode 100644 index f94e1120492..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -#include "indexing_utils.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; - -layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 3) uniform PRECISION restrict Sizes { - ivec4 sizes; -}; - -layout(set = 0, binding = 4) uniform PRECISION restrict SelectVal { - // data.x: index along batch dim to select - // data.y: number of batches - // data.z: number of texels per batch - // data.w: unused - ivec4 select_info; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const int num_batches = select_info.y; - const int num_texel_per_batch = select_info.z; - const int index = select_info.x; - - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - const uint src_pos_z = (num_texel_per_batch * index) + pos.z; - imageStore( - image_out, pos, texelFetch(image_in, ivec3(pos.x, pos.y, src_pos_z), 0)); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.glsl deleted file mode 100644 index 0bbec798484..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.glsl +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} -#define T ${texel_component_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; - -layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 3) uniform PRECISION restrict Sizes { - ivec4 sizes; -}; - -// index to select -layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal { - int index; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - const int tex = index / 4; - const int ind = index % 4; - const T v = VEC4_T(texelFetch(image_in, ivec3(pos.x, pos.y, tex), 0))[ind]; - - imageStore(image_out, ivec3(pos.x, pos.y, 0), VEC4_T(v, 0, 0, 0)); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.glsl deleted file mode 100644 index 517362f76ea..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.glsl +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; - -layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 3) uniform PRECISION restrict Sizes { - ivec4 sizes; -}; - -layout(set = 0, binding = 4) uniform PRECISION restrict SelectVal { - // data.x: index along channel dim to select - // data.y: number of batches - // data.z: number of texels per batch - // data.w: unused - ivec4 select_info; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - const int num_batches = select_info.y; - const int num_texel_per_batch = select_info.z; - const int index = select_info.x; - - // read in the same channel from 4 separate batches - VEC4_T out_texel = VEC4_T(0, 0, 0, 0); - for (int k = 0; k < 4; k++) { - if ((k + pos.z * 4) >= - num_batches) { - break; - } - const uint src_pos_z = (4 * num_texel_per_batch * pos.z) + - (k * num_texel_per_batch) + (index / 4); - const uint src_pos_t = index % 4; - out_texel[k] = - VEC4_T(texelFetch(image_in, ivec3(pos.x, pos.y, src_pos_z), 0))[src_pos_t]; - } - - imageStore(image_out, pos, out_texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl deleted file mode 100644 index 87409fb35fd..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; - -layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 3) uniform PRECISION restrict Sizes { - ivec4 sizes; -}; - -// index to select -layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal { - int index; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - // w - const int src_x = pos.x; - // h - const int src_y = index; - // c - const int src_z = pos.y; - - const VEC4_T v = VEC4_T(texelFetch(image_in, ivec3(src_x, src_y, src_z), 0)); - - for (int i = 0; i < 4; i++) { - ivec3 new_pos = ivec3(pos.x, pos.y * 4 + i, 0); - - // When the C-channel exceeds original block size, exit early - if (new_pos.y >= sizes.y) { - return; - } - - imageStore(image_out, new_pos, VEC4_T(v[i], 0, 0, 0)); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.yaml deleted file mode 100644 index a373f1decd9..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.yaml +++ /dev/null @@ -1,10 +0,0 @@ -select_height_3d: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: select_height_3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl 
b/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl deleted file mode 100644 index 2e4e2afb2db..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; - -layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 3) uniform PRECISION restrict Sizes { - ivec4 sizes; -}; - -// index to select -layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal { - // data.x: index along height dim to select - // data.y: number of batches - // data.z: number of texels per batch - // data.w: unused - ivec4 select_info; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - const int num_batches = select_info.y; - const int num_texel_per_batch = select_info.z; - const int index = select_info.x; - - VEC4_T out_texel = VEC4_T(0, 0, 0, 0); - // read in the same channel from 4 separate batches - for (int k = 0; k < 4; k++) { - if ((k + pos.z * 4) >= num_batches - ) { // < 4 batches for this texel, exit early - break; - } - const uint src_pos_z = (pos.z * num_texel_per_batch * 4) + - k * num_texel_per_batch + (pos.y / 4); - out_texel[k] = VEC4_T(texelFetch( - image_in, ivec3(pos.x, index, src_pos_z), 0))[pos.y % 4]; - } - 
imageStore(image_out, pos, out_texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.yaml deleted file mode 100644 index c3724f1157a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.yaml +++ /dev/null @@ -1,10 +0,0 @@ -select_height_4d: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: select_height_4d diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl deleted file mode 100644 index 1e12d15ab21..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; - -layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 3) uniform PRECISION restrict Sizes { - ivec4 sizes; -}; - -// index to select -layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal { - int index; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - // w - const int src_x = index; - // h - const int src_y = pos.x; - // c - const int src_z = pos.y; - - const VEC4_T v = VEC4_T(texelFetch(image_in, ivec3(src_x, src_y, src_z), 0)); - - for (int i = 0; i < 4; i++) { - ivec3 new_pos = ivec3(pos.x, pos.y * 4 + i, 0); - - // When the C-channel exceeds original block size, exit early - if (new_pos.y >= sizes.y) { - return; - } - - imageStore(image_out, new_pos, VEC4_T(v[i], 0, 0, 0)); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.yaml deleted file mode 100644 index a3070bf6ca3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.yaml +++ /dev/null @@ -1,10 +0,0 @@ -select_width_3d: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: select_width_3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl deleted file mode 100644 index ffbd8afbda0..00000000000 --- 
a/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; - -layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 3) uniform PRECISION restrict Sizes { - ivec4 sizes; -}; - -// index to select -layout(set = 0, binding = 4) uniform PRECISION restrict SelectVal { - // data.x: index along width dim to select - // data.y: number of batches - // data.z: number of texels per batch - // data.w: unused - ivec4 select_info; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - const int num_batches = select_info.y; - const int num_texel_per_batch = select_info.z; - const int index = select_info.x; - - //vec4 out_texel = vec4(0, 0, 0, 0); - VEC4_T out_texel = VEC4_T(0, 0, 0, 0); - // read in the same channel from 4 separate batches - for (int k = 0; k < 4; k++) { - if ((k + pos.z * 4) >= - num_batches) { // < 4 batches for this texel, exit early - break; - } - const uint src_pos_z = (pos.z * num_texel_per_batch * 4) + - k * num_texel_per_batch + (pos.y / 4); - - out_texel[k] = VEC4_T(texelFetch( - image_in, ivec3(index, pos.x, src_pos_z), 0))[pos.y % 4]; - } - imageStore(image_out, pos, out_texel); -} diff --git 
a/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.yaml deleted file mode 100644 index f1131d77395..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.yaml +++ /dev/null @@ -1,10 +0,0 @@ -select_width_4d: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: select_width_4d diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh new file mode 100644 index 00000000000..5d4cc70fdc1 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh @@ -0,0 +1,53 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef SLICE_GLSLH +#define SLICE_GLSLH + +/** + * Enable the fast path if a texel loaded from the input texture can be used as + * is to store to the output texture. The following conditions must be met: + * + * 1. The input and output textures have the same packed dimension. + * 2. The select_dim must not be the packed dimension of the input. + */ +bool can_use_fast_path() { + if (out_packed_dim != in_packed_dim) { + return false; + } + if (in_packed_dim == selected_dim) { + return false; + } + return true; +} + +/* + * Converts output tensor indices to input tensor indices for the slice operation. + * This function maps the output indices to the corresponding input indices based on + * the slice parameters (start, step, selected_dim). 
+ * + * Parameters assumed to be defined in the layout specifier: + * - in_sizes + * - selected_dim + * - start + * - step + */ +ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { + ivec4 in_tidx = out_tidx; + + int adjusted_start = start; + if (start < 0) { + adjusted_start = start + in_sizes[selected_dim]; + } + + in_tidx[selected_dim] = adjusted_start + out_tidx[selected_dim] * step; + + return in_tidx; +} + +#endif // SLICE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.glsl b/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.glsl deleted file mode 100644 index 0a6fa31a65f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.glsl +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(2, "ivec4", "out_sizes")} -${layout_declare_ubo(3, "ivec4", "in_sizes")} - -layout(set = 0, binding = 4) uniform PRECISION restrict SliceArg { - int offset; - int step; -} -slice_arg; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void main() { - const ivec3 out_pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(out_pos, out_sizes, packed_dim); - - if (any(greaterThanEqual(idx, out_sizes))) { - return; - } - - // We map the output pos using the buffer index. For each index in the texel, - // we calculate the source whcn-coordinate amended with offset-ed channel - // value. 
Then we calculate the actual texture position from the - // whcn-coordinate. - const ivec4 buf_indices = tidx_to_nchwi(idx, out_sizes, packed_dim); - - vec4 outex; - for (int i=0;i<4;i++) { - ivec4 user_coor = nchwi_to_tidx(buf_indices[i], out_sizes); - - int in_dim = user_coor[packed_dim]; - - ivec4 in_user_coor = user_coor; - in_user_coor[packed_dim] = slice_arg.offset + in_dim * slice_arg.step; - - ivec4 in_pow_elem = to_texture_elem_pos( - in_user_coor, - in_sizes, - packed_dim); - - vec4 v = texelFetch(t_in, in_pow_elem.xyz, 0); - - outex[i] = v[in_pow_elem.w]; - } - imageStore(t_out, out_pos, outex); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.yaml b/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.yaml deleted file mode 100644 index 718e7316824..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.yaml +++ /dev/null @@ -1,11 +0,0 @@ -slice_packed_dim: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: slice_packed_dim diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.glsl b/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.glsl deleted file mode 100644 index 54f0bd0b78c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.glsl +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; - -layout(set = 0, binding = 2) uniform PRECISION restrict Sizes { - ivec4 sizes; -}; - -layout(set = 0, binding = 3) uniform PRECISION restrict SliceArg { - int dim; - int offset; - int step; - int image_in_channel_size; -} -slice_arg; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (pos_out_of_bounds(pos, sizes, packed_dim)) { - return; - } - - ivec3 in_pos = pos; - - // slice along batch axis - if (slice_arg.dim == 3) { - // index of the channel inside a batch - const int chanl_index = pos.z % slice_arg.image_in_channel_size; - // index of batch - const int batch_index = pos.z / slice_arg.image_in_channel_size; - in_pos.z = (slice_arg.offset + batch_index * slice_arg.step) * slice_arg.image_in_channel_size + chanl_index; - } else if (slice_arg.dim == C_DIM) { - // index of the channel inside a batch - const int chanl_index = pos.z % sizes.z; - // index of batch - const int batch_index = pos.z / sizes.z; - in_pos.z = slice_arg.offset + batch_index * slice_arg.image_in_channel_size + chanl_index * slice_arg.step; - } else if (slice_arg.dim == H_DIM) { - in_pos.y = slice_arg.offset + pos.y * slice_arg.step; - } else { - in_pos.x = slice_arg.offset + pos.x * slice_arg.step; - } - - imageStore(image_out, pos, texelFetch(image_in, in_pos, 0)); - -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.yaml b/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.yaml deleted file mode 100644 index 0453bb707b1..00000000000 --- 
a/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.yaml +++ /dev/null @@ -1,10 +0,0 @@ -slice_unpacked_dim: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: slice_unpacked_dim diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl new file mode 100644 index 00000000000..3ca854e0526 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl @@ -0,0 +1,58 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} +#define T ${buffer_scalar_type(DTYPE)} + +${define_active_storage_type("buffer")} +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" +${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} +${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")} + +$if OP_NAME == "slice": + ${layout_declare_ubo(B, "int", "start")} + ${layout_declare_ubo(B, "int", "step")} + +$if OP_NAME == "select": + ${layout_declare_ubo(B, "int", "index")} + +layout(push_constant) uniform restrict Block { + ivec4 in_sizes; + ivec4 out_strides; + ivec4 in_strides; + int out_numel; + int selected_dim; +}; + +${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "in_packed_dim", "DEFAULT_LAYOUT")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +#include "${OP_NAME}.glslh" + +void main() { + const int out_bufi = ivec3(gl_GlobalInvocationID).x; + if (out_bufi >= out_numel) { + return; + } + + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); + ivec4 
in_tidx = out_tidx_to_in_tidx(out_tidx); + + const int in_bufi = tidx_to_bufi(in_tidx, in_strides); + t_out[out_bufi] = t_in[in_bufi]; +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.yaml similarity index 54% rename from backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.yaml rename to backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.yaml index 1c5c4e34b06..bdde613c8ce 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.yaml @@ -1,10 +1,13 @@ -select_channel_3d: +transfer_buffer: parameter_names_with_default_values: DTYPE: float - NDIM: 3 + OP_NAME: select generate_variant_forall: DTYPE: - VALUE: half - VALUE: float shader_variants: - - NAME: select_channel_3d + - NAME: select_buffer + OP_NAME: select + - NAME: slice_buffer + OP_NAME: slice diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl new file mode 100644 index 00000000000..d3e25436c04 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl @@ -0,0 +1,83 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} +#define T ${buffer_scalar_type(DTYPE)} + +${define_active_storage_type("texture3d")} +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} + +$if OP_NAME == "slice": + ${layout_declare_ubo(B, "int", "start")} + ${layout_declare_ubo(B, "int", "step")} + +$if OP_NAME == "select": + ${layout_declare_ubo(B, "int", "index")} + +layout(push_constant) uniform restrict Block { + ivec4 out_sizes; + ivec4 in_sizes; + int selected_dim; +}; + +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); +const lowp int out_packed_dim = unhash_packed_dim(out_layout); + +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); +const lowp int in_packed_dim = unhash_packed_dim(in_layout); + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +#include "${OP_NAME}.glslh" + +void main() { + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim); + + if (any(greaterThanEqual(out_tidx, out_sizes))) { + return; + } + + if (can_use_fast_path()) { + ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); + ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); + VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); + + write_texel_lpos(t_out, lpos, in_texel, out_axis_map); + } + else { + VEC4_T out_texel = VEC4_T(0); + for (int texel_i = 0; texel_i < 4; ++texel_i) { + ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); + ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); + int element_idx = in_tidx[in_packed_dim] % 4; + + VEC4_T in_texel = 
VEC4_T(load_texel(t_in, in_pos)); + T selected_value = T(in_texel[element_idx]); + + out_texel[texel_i] = selected_value; + + out_tidx[out_packed_dim]++; + } + + write_texel_lpos(t_out, lpos, out_texel, out_axis_map); + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.yaml similarity index 52% rename from backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.yaml rename to backends/vulkan/runtime/graph/ops/glsl/transfer_texture.yaml index 6236555f5dd..f877ee036e4 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.yaml @@ -1,10 +1,13 @@ -select_channel_4d: +transfer_texture: parameter_names_with_default_values: DTYPE: float - NDIM: 3 + OP_NAME: select generate_variant_forall: DTYPE: - VALUE: half - VALUE: float shader_variants: - - NAME: select_channel_4d + - NAME: select_texture3d + OP_NAME: select + - NAME: slice_texture3d + OP_NAME: slice diff --git a/backends/vulkan/runtime/graph/ops/glsl/where.glsl b/backends/vulkan/runtime/graph/ops/glsl/where.glsl new file mode 100644 index 00000000000..5df813d1241 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/where.glsl @@ -0,0 +1,111 @@ +// where.glsl + +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} +#define T ${buffer_scalar_type(DTYPE)} +#define COND_T ${buffer_scalar_type("bool")} + +${define_active_storage_type(STORAGE)} +${define_required_extensions(DTYPE)} +${define_required_extensions("bool")} + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_condition", "bool", STORAGE)} +${layout_declare_tensor(B, "r", "t_self", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)} + + +#include "indexing_utils.h" + +$if STORAGE == "buffer": + ${layout_declare_ubo(B, "int", "out_numl")} + ${layout_declare_ubo(B, "ivec4", "out_strides")} + ${layout_declare_ubo(B, "ivec4", "cond_strides")} + ${layout_declare_ubo(B, "ivec4", "self_strides")} + ${layout_declare_ubo(B, "ivec4", "other_strides")} + + ${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")} + ${layout_declare_spec_const(C, "int", "cond_packed_dim", "DEFAULT_LAYOUT")} + ${layout_declare_spec_const(C, "int", "self_packed_dim", "DEFAULT_LAYOUT")} + ${layout_declare_spec_const(C, "int", "other_packed_dim", "DEFAULT_LAYOUT")} +$else: + ${layout_declare_ubo(B, "ivec3", "out_limits")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +#ifdef USING_BUFFER + +void main() { + int out_bufi = int(gl_GlobalInvocationID.x); + // ivec4 tidx = ivec4(gl_GlobalInvocationID, 0); + // int out_bufi = tidx_to_bufi(tidx, out_strides); + // int cond_bufi = tidx_to_bufi(tidx, cond_strides); + // int self_bufi = tidx_to_bufi(tidx, self_strides); + // int other_bufi = tidx_to_bufi(tidx, other_strides); + if (out_bufi >= out_numl) { + return; + } + + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); + out_bufi = tidx_to_bufi(out_tidx, out_strides); + + const ivec4 cond_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); + const int cond_bufi = 
tidx_to_bufi(cond_tidx, cond_strides); + + const ivec4 self_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); + const int self_bufi = tidx_to_bufi(self_tidx, self_strides); + + const ivec4 other_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); + const int other_bufi = tidx_to_bufi(other_tidx, other_strides); + + COND_T cond = t_condition[cond_bufi] ; + T v_self = t_self[self_bufi]; + T v_other = t_other[other_bufi]; + + if (cond > 0) { + t_out[out_bufi] = v_self; + } else { + t_out[out_bufi] = v_other; + } +} + +#else // !USING_BUFFER + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + vec4 cond = load_texel(t_condition, pos); + VEC4_T selftex = load_texel(t_self, pos); + VEC4_T othertex = load_texel(t_other, pos); + + VEC4_T outtex; + + for (int idx = 0; idx < 4; ++idx) { + if (cond[idx] == 1) { + outtex[idx] = selftex[idx]; + } else { + outtex[idx] = othertex[idx]; + } + } + write_texel(t_out, pos, outtex); +} + #endif // !USING_BUFFER diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/where.yaml similarity index 64% rename from backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.yaml rename to backends/vulkan/runtime/graph/ops/glsl/where.yaml index 9c7d54c8f69..edbd843a336 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/where.yaml @@ -1,10 +1,12 @@ -select_batch_4d: +where: parameter_names_with_default_values: DTYPE: float - NDIM: 3 generate_variant_forall: + STORAGE: + - VALUE: texture3d + - VALUE: buffer DTYPE: - VALUE: half - VALUE: float shader_variants: - - NAME: select_batch_4d + - NAME: where diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp index b547bc3572d..d0276b1783b 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp +++ 
b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp @@ -105,9 +105,9 @@ void add_buffer_to_image_node( // Input and Outputs {{image, vkapi::kWrite}, {buffer, vkapi::kRead}}, // Parameter Buffers - {graph.sizes_ubo(image), graph.strides_ubo(buffer)}, - // Push Constants {}, + // Push Constants + {graph.sizes_pc_of(image), graph.strides_pc_of(buffer)}, // Specialization Constants {graph.hashed_layout_of(image)}, // Resize Args diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.cpp b/backends/vulkan/runtime/graph/ops/impl/Common.cpp new file mode 100644 index 00000000000..4de099231d3 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Common.cpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace vkcompute { + +utils::uvec3 default_pick_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& additional_args) { + (void)shader; + const ValueRef out = args.at(0).refs.at(0); + return graph->create_global_wg_size(out); +} + +utils::uvec3 default_pick_local_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& additional_args) { + (void)shader; + return graph->create_local_wg_size(global_workgroup_size); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.h b/backends/vulkan/runtime/graph/ops/impl/Common.h new file mode 100644 index 00000000000..d5ff455ae41 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Common.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace vkcompute { + +/** + * Creates a global workgroup size based on the first output tensor in the args. + * This is a utility function that extracts the output tensor from + * args.at(0).refs.at(0) and calls graph->create_global_wg_size(out) on it. + * + * @param graph The ComputeGraph instance + * @param args Vector of ArgGroup containing the output tensor reference + * @return utils::uvec3 The global workgroup size + */ +utils::uvec3 default_pick_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& additional_args); + +/** + * Creates a local workgroup size based on the first output tensor in the args. + * This is a utility function that extracts the output tensor from + * args.at(0).refs.at(0) and calls graph->create_local_wg_size(out) on it. 
+ * + * @param graph The ComputeGraph instance + * @param args Vector of ArgGroup containing the output tensor reference + * @return utils::uvec3 The local workgroup size + */ +utils::uvec3 default_pick_local_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& additional_args); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index fbe4a61befc..32f478fa5bd 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -106,9 +106,10 @@ ValueRef prepack_biases( graph.create_local_wg_size(v), vref, v, - {t->sizes_ubo()}, + {}, // Specialization constants - {t->hashed_layout()})); + {t->hashed_layout()}, + {graph.sizes_pc_of(v)})); return v; } diff --git a/backends/vulkan/runtime/graph/ops/impl/Select.cpp b/backends/vulkan/runtime/graph/ops/impl/Select.cpp index a83e986e414..69d49e8283b 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Select.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Select.cpp @@ -8,129 +8,136 @@ #include -#include -#include - -#include +#include #include #include namespace vkcompute { -void check_args( - const api::vTensor& t_in, - int64_t dim, - int64_t index, - const api::vTensor& t_out) { - VK_CHECK_COND(check_packed_dim_is(t_in, WHCN::kChannelsDim)); - VK_CHECK_COND(check_packed_dim_is(t_out, WHCN::kChannelsDim)); +void resize_select_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + ValueRef out = args.at(0).refs.at(0); + ValueRef in = args.at(1).refs.at(0); + int64_t dim = graph->extract_scalar(extra_args.at(0)); - const int64_t in_dim = t_in.dim(); - VK_CHECK_COND( - in_dim == 3 || in_dim == 4, - "Vulkan select only support 3d or 4d tensors!"); - - const int64_t in_size = t_in.size(dim); - - if (index < -in_size || 
index >= in_size) { - VK_CHECK_COND( - false, - "select(): index ", - index, - " t_outof range for tensor of size ", - in_size, - " at dimension ", - dim); + int64_t in_ndim = graph->dim_of(in); + + if (dim < 0) { + dim += in_ndim; + } + + std::vector new_out_sizes; + for (int64_t i = 0; i < in_ndim; ++i) { + if (i != dim) { + new_out_sizes.push_back(graph->size_at(i, in)); + } } + + graph->virtual_resize(out, new_out_sizes); } -void add_select_int_node( +void check_select_args( ComputeGraph& graph, const ValueRef in, const ValueRef dim_ref, const ValueRef index_ref, const ValueRef out) { - vTensorPtr t_in = graph.get_tensor(in); - vTensorPtr t_out = graph.get_tensor(out); int64_t dim = graph.extract_scalar(dim_ref); - int64_t index = graph.extract_scalar(index_ref); + int64_t index = graph.extract_optional_scalar(index_ref, 0); + int64_t in_ndim = graph.dim_of(in); - check_args(*t_in, dim, index, *t_out); + if (dim < 0) { + dim += in_ndim; + } - const int64_t in_size = t_in->size(dim); + VK_CHECK_COND( + dim >= 0 && dim < in_ndim, + "Dimension out of range (expected to be in range of [", + -in_ndim, + ", ", + in_ndim - 1, + "], but got ", + dim, + ")"); + + const int64_t in_size_at_dim = graph.size_at(dim, in); if (index < 0) { - index += in_size; + index += in_size_at_dim; } - std::string kernel_name; - - // for 3d tensors, these values are not used by the shader. 
- int32_t num_texel_per_batch = 1; - int32_t num_batches = 1; - - int64_t in_dim = t_in->dim(); - if (in_dim == 3) { - if (dim == 0) { - kernel_name = "select_channel_3d"; - } else if (dim == 1) { - kernel_name = "select_height_3d"; - } else if (dim == 2) { - kernel_name = "select_width_3d"; - } else { - VK_CHECK_COND( - false, "Unexpected dim value=", dim, "for the input 3d tensor"); - } - } else { // self.dim() == 4 - num_texel_per_batch = - static_cast(std::ceil(static_cast(t_in->size(1)) / 4)); - num_batches = t_in->size(0); - if (dim == 0) { - kernel_name = "select_batch_4d"; - } else if (dim == 1) { - kernel_name = "select_channel_4d"; - } else if (dim == 2) { - kernel_name = "select_height_4d"; - } else if (dim == 3) { - kernel_name = "select_width_4d"; - } else { + VK_CHECK_COND( + index >= 0 && index < in_size_at_dim, + "select(): index ", + index, + " out of range for tensor of size ", + in_size_at_dim, + " at dimension ", + dim); + + // Check that output tensor has correct dimensions + int64_t out_dim = graph.dim_of(out); + VK_CHECK_COND( + out_dim == in_ndim - 1, + "Output tensor dimension mismatch (expected ", + in_size_at_dim - 1, + ", but got ", + out_dim, + ")"); + + // Check that output tensor has correct sizes + int64_t out_idx = 0; + for (int64_t i = 0; i < in_size_at_dim; ++i) { + if (i != dim) { VK_CHECK_COND( - false, "Unexpected dim value=", dim, "for the input 4d tensor"); + graph.size_at(out_idx, out) == graph.size_at(i, in), + "Output size mismatch at dimension ", + out_idx, + " (expected ", + graph.size_at(i, in), + ", but got ", + graph.size_at(out_idx, out), + ")"); + out_idx++; } } +} - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, *t_out); +/** + * Adds a select operation node to the compute graph. + * + * The select operator extracts a slice from a tensor along a specified + * dimension at a given index. 
It effectively reduces the dimensionality of the + * input tensor by one, by selecting a single slice at the specified index along + * the given dimension. For example, if input is a 3D tensor with shape [2,3,4] + * and we select dimension 1, index 2, the output will be a 2D tensor with shape + * [2,4]. + */ +void add_select_copy_node( + ComputeGraph& graph, + const ValueRef in, + const ValueRef dim_ref, + const ValueRef index_ref, + const ValueRef out) { + check_select_args(graph, in, dim_ref, index_ref, out); - // TODO: add resizing to support dynamic shapes. - graph.execute_nodes().emplace_back(new DispatchNode( + add_transfer_copy_node( graph, - VK_KERNEL_FROM_STR(kernel_name), - graph.create_global_wg_size(out), - graph.create_local_wg_size(out), - // Inputs and Outputs - {{out, vkapi::MemoryAccessType::WRITE}, - {in, vkapi::MemoryAccessType::READ}}, - // Parameter buffers - {t_out->logical_limits_ubo(), - t_out->sizes_ubo(), - // TODO: num_batches and num_texel_per_batch are provided by - // t_out->sizes. Can change the following to reduce params - // created. 
- graph.create_params_buffer( - utils::make_ivec4({index, num_batches, num_texel_per_batch, 0}))}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - nullptr)); + TransferType::SELECT, + in, + dim_ref, + index_ref, + kDummyValueRef, + kDummyValueRef, + out, + {dim_ref, index_ref}, + resize_select_node); } void select_int(ComputeGraph& graph, const std::vector& args) { - return add_select_int_node(graph, args[0], args[1], args[2], args[3]); + return add_select_copy_node(graph, args[0], args[1], args[2], args[3]); } REGISTER_OPERATORS { diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp index c40e16f7c0a..67d714d10aa 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp @@ -8,12 +8,10 @@ #include -#include - #include +#include #include -#include #include #include @@ -33,127 +31,73 @@ inline int64_t normalize_idx( return normalize(index, max); } -void add_slice_tensor_copy_node( - ComputeGraph& graph, - ValueRef in, - ValueRef dim_ref, - ValueRef opt_start_ref, - ValueRef opt_end_ref, - ValueRef step_ref, - ValueRef out) { - vTensorPtr t_in = graph.get_tensor(in); - vTensorPtr t_out = graph.get_tensor(out); - - VK_CHECK_COND(check_same_packed_dim(*t_in, *t_out)); - - // Need normalize the dim - int64_t dim = graph.extract_scalar(dim_ref); - - VK_CHECK_COND( - -t_in->dim() <= dim && dim < t_in->dim(), - "dim must be in range of [-self.dim(), self.dim()), but current dim's value is ", - dim, - " and self.dim() = ", - t_in->dim()); - - dim = normalize(dim, t_in->dim()); - - DimIndex dim_index = normalize_to_dim_index(*t_in, dim); +void resize_slice_copy_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + ValueRef out_ref = args.at(0).refs.at(0); + ValueRef in_ref = args.at(1).refs.at(0); + int64_t dim = graph->extract_scalar(extra_args.at(0)); 
std::optional opt_start = - graph.extract_optional_scalar(opt_start_ref); + graph->extract_optional_scalar(extra_args.at(1)); std::optional opt_end = - graph.extract_optional_scalar(opt_end_ref); - int64_t step = graph.extract_scalar(step_ref); - - const auto in_sizes = t_in->sizes(); - const auto out_sizes = t_out->sizes(); - - int64_t start = opt_start.value_or(0); - int64_t end = opt_end.value_or(in_sizes[dim]); + graph->extract_optional_scalar(extra_args.at(2)); + int64_t step = graph->extract_scalar(extra_args.at(3)); - start = normalize_idx(start, in_sizes[dim], 0); - end = normalize_idx(end, in_sizes[dim], in_sizes[dim]); + // Normalize dim + if (dim < 0) { + dim += graph->dim_of(in_ref); + } - const vkapi::SpecVarList spec_vars = {t_in->packed_dim()}; + const std::vector in_sizes = graph->sizes_of(in_ref); + int64_t dim_size = in_sizes.at(dim); - const auto packed_dim_idx = - static_cast(DimIndex::DIM_LAST - t_in->packed_dim()); + int64_t start = opt_start.value_or(0); + int64_t end = opt_end.value_or(dim_size); - // if slice dim is the same as the packed dim, we can use the channel slice - if (dim_index == packed_dim_idx) { - // slice by channel - std::string kernel_name = "slice_packed_dim"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, *t_out); + // Normalize start and end indices + start = normalize_idx(start, dim_size, 0); + end = normalize_idx(end, dim_size, dim_size); - const struct Block final { - int offset; - int step; - } params{ - static_cast(start), - static_cast(step), - }; + // Calculate output size + std::vector new_out_sizes = in_sizes; + new_out_sizes.at(dim) = (end - start + step - 1) / step; // Ceiling division - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - graph.create_global_wg_size(out), - graph.create_local_wg_size(out), - {{out, vkapi::MemoryAccessType::WRITE}, - {in, vkapi::MemoryAccessType::READ}}, - {t_out->sizes_ubo(), - t_in->sizes_ubo(), - 
graph.create_params_buffer(params)}, - {}, - spec_vars, - {}, - nullptr)); - - } else { - // GPU's coordinate is in x = 0, y = 1, z = 2, w = 3 - const int64_t gpu_dim = -(dim_index + 1); - // stride of input tensor's channel dimension - int64_t in_channel_stride = dim_at(in_sizes, kChannel4D); - VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step)); - - // Due to channel packing, each batch value is span over stride planes - if (dim_index == kBatch4D && packed_dim_idx == kChannel4D) { - in_channel_stride = utils::div_up_4(in_channel_stride); - } + graph->virtual_resize(out_ref, new_out_sizes); +} - std::string kernel_name = "slice_unpacked_dim"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, *t_out); - - utils::uvec3 global_size = t_out->logical_limits(); - utils::uvec3 local_size = graph.create_local_wg_size(global_size); - - const struct Block final { - int dim; - int offset; - int step; - int stride; - } params{ - static_cast(gpu_dim), - static_cast(start), - static_cast(step), - static_cast(in_channel_stride), - }; - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_size, - local_size, - {{out, vkapi::MemoryAccessType::WRITE}, - {in, vkapi::MemoryAccessType::READ}}, - {t_out->sizes_ubo(), graph.create_params_buffer(params)}, - {}, - spec_vars, - {}, - nullptr)); - } +/** + * Adds a slice_copy operation node to the compute graph. + * + * The slice operator extracts a portion of a tensor along a specified + * dimension. It creates a new tensor that contains a subset of the input + * tensor's data, defined by start, end, and step parameters along the given + * dimension. + * + * For example, if input is a tensor with shape [4,5,6] and we slice along + * dimension 1 with start=1, end=4, step=2, the output will have shape [4,2,6], + * containing elements from the input at positions 1 and 3 along dimension 1. 
+ */ +void add_slice_copy_node( + ComputeGraph& graph, + ValueRef in, + ValueRef dim_ref, + ValueRef opt_start_ref, + ValueRef opt_end_ref, + ValueRef step_ref, + ValueRef out) { + add_transfer_copy_node( + graph, + TransferType::SLICE, + in, + dim_ref, + opt_start_ref, + opt_end_ref, + step_ref, + out, + {dim_ref, opt_start_ref, opt_end_ref, step_ref}, + resize_slice_copy_node); } std::vector get_slice_sizes( @@ -186,16 +130,16 @@ void resize_slice_view_node( const std::vector& args, const std::vector& extra_args) { (void)args; - vTensorPtr out = graph->get_tensor(extra_args[0]); + ValueRef out_ref = extra_args.at(0); std::vector new_out_sizes = get_slice_sizes( *graph, - extra_args[1], // input - extra_args[2], // dim - extra_args[3], // optional start - extra_args[4]); // optional end + extra_args.at(1), // input + extra_args.at(2), // dim + extra_args.at(3), // optional start + extra_args.at(4)); // optional end - out->virtual_resize(new_out_sizes); + graph->virtual_resize(out_ref, new_out_sizes); } void check_slice_view_args( @@ -267,54 +211,54 @@ void add_slice_view_node( std::vector new_out_sizes = get_slice_sizes(graph, in_ref, dim_ref, opt_start_ref, opt_end_ref); - graph.get_tensor(out_ref)->virtual_resize(new_out_sizes); + graph.virtual_resize(out_ref, new_out_sizes); graph.execute_nodes().emplace_back(new ExecuteNode( resize_slice_view_node, {out_ref, in_ref, dim_ref, opt_start_ref, opt_end_ref, opt_step_ref})); } -void slice_tensor_copy(ComputeGraph& graph, const std::vector& args) { - return add_slice_tensor_copy_node( +void slice_copy(ComputeGraph& graph, const std::vector& args) { + return add_slice_copy_node( graph, - args[0], - args[1], // dim - args[2], // optional start - args[3], // optional end - args[4], // step - args[5]); + args.at(0), + args.at(1), // dim + args.at(2), // optional start + args.at(3), // optional end + args.at(4), // step + args.at(5)); } -void slice_tensor(ComputeGraph& graph, const std::vector& args) { - ValueRef in = 
args[0]; - ValueRef out = args[5]; +void slice(ComputeGraph& graph, const std::vector& args) { + ValueRef in = args.at(0); + ValueRef out = args.at(5); // Special case if out is a view of in if (graph.val_is_view_of(out, in)) { add_slice_view_node( graph, in, - args[1], // dim - args[2], // optional start - args[3], // optional end - args[4], // step + args.at(1), // dim + args.at(2), // optional start + args.at(3), // optional end + args.at(4), // step out); return; } - add_slice_tensor_copy_node( + add_slice_copy_node( graph, in, - args[1], // dim - args[2], // optional start - args[3], // optional end - args[4], // step + args.at(1), // dim + args.at(2), // optional start + args.at(3), // optional end + args.at(4), // step out); } REGISTER_OPERATORS { - VK_REGISTER_OP(aten.slice_copy.Tensor, slice_tensor_copy); - VK_REGISTER_OP(aten.slice.Tensor, slice_tensor); + VK_REGISTER_OP(aten.slice_copy.Tensor, slice_copy); + VK_REGISTER_OP(aten.slice.Tensor, slice); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index f39b0fc33ff..8c060a9da4b 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -28,14 +28,14 @@ void add_staging_to_tensor_node( vkapi::ShaderInfo shader = get_nchw_to_tensor_shader( *graph.get_tensor(out_tensor), graph.int8_buffers_enabled()); - vkapi::ParamsBindList ubos; + std::vector pcs; if (graph.is_buffer_storage(out_tensor)) { - ubos.append( - {graph.sizes_ubo(out_tensor), - graph.strides_ubo(out_tensor), - graph.numel_ubo(out_tensor)}); + pcs = { + graph.sizes_pc_of(out_tensor), + graph.strides_pc_of(out_tensor), + graph.numel_pc_of(out_tensor)}; } else { - ubos.append({graph.sizes_ubo(out_tensor)}); + pcs = {graph.sizes_pc_of(out_tensor)}; } graph.execute_nodes().emplace_back(new DispatchNode( @@ -46,9 +46,9 @@ void add_staging_to_tensor_node( // Input and Outputs {{out_tensor, 
vkapi::kWrite}, {in_staging, vkapi::kRead}}, // Parameter Buffers - ubos, - // Push Constants {}, + // Push Constants + pcs, // Specialization Constants {graph.hashed_layout_of(out_tensor)}, // Resize Args @@ -127,14 +127,14 @@ void add_prepack_standard_node( vkapi::ShaderInfo shader = get_nchw_to_tensor_shader( *graph.get_tensor(tensor), graph.int8_buffers_enabled()); - vkapi::ParamsBindList ubos; + std::vector pcs; if (graph.is_buffer_storage(tensor)) { - ubos.append( - {graph.sizes_ubo(tensor), - graph.strides_ubo(tensor), - graph.numel_ubo(tensor)}); + pcs = { + graph.sizes_pc_of(tensor), + graph.strides_pc_of(tensor), + graph.numel_pc_of(tensor)}; } else { - ubos.append({graph.sizes_ubo(tensor)}); + pcs = {graph.sizes_pc_of(tensor)}; } int transpose_hw_spec = transpose_hw ? 1 : 0; @@ -148,9 +148,10 @@ void add_prepack_standard_node( tensor_data, tensor, // Parameter Buffers - ubos, + {}, // Specialization Constants - {graph.hashed_layout_of(tensor), transpose_hw_spec})); + {graph.hashed_layout_of(tensor), transpose_hw_spec}, + pcs)); } ValueRef prepack_standard( diff --git a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp new file mode 100644 index 00000000000..423c9789d67 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include +#include +#include + +namespace vkcompute { + +/** + * Adds a transfer copy operation node to the compute graph. + * This function handles both SELECT and SLICE operations based on the + * transfer_type parameter. 
+ */ +void add_transfer_copy_node( + ComputeGraph& graph, + TransferType transfer_type, + const ValueRef in, + const ValueRef dim_ref, + const ValueRef index_or_start_ref, + const ValueRef end_ref, + const ValueRef step_ref, + const ValueRef out, + const std::vector& resize_args, + const ExecuteNode::ResizeFunction& resize_fn) { + int64_t ndim = graph.dim_of(in); + int64_t dim = graph.extract_scalar(dim_ref); + + if (dim < 0) { + dim += ndim; + } + + int64_t dim_whcn = nchw_dim_to_whcn_dim(dim, ndim); + + vkapi::ParamsBindList param_buffers; + if (transfer_type == TransferType::SELECT) { + param_buffers = { + graph.get_or_create_int_param_buffer(index_or_start_ref, 0)}; + } else { // TransferType::SLICE + param_buffers = { + graph.get_or_create_int_param_buffer(index_or_start_ref, 0), + graph.get_or_create_int_param_buffer(step_ref, 1)}; + } + + const struct TransferParams { + const int32_t dim; + } transfer_params{static_cast(dim_whcn)}; + + std::vector push_constants; + vkapi::SpecVarList spec_vars; + + if (graph.is_buffer_storage(out)) { + push_constants = { + graph.sizes_pc_of(in), + graph.strides_pc_of(out), + graph.strides_pc_of(in), + graph.numel_pc_of(out), + PushConstantDataInfo(&transfer_params, sizeof(transfer_params))}; + + spec_vars = { + graph.packed_dim_of(out), + graph.packed_dim_of(in), + }; + } else { + push_constants = { + graph.sizes_pc_of(out), + graph.sizes_pc_of(in), + PushConstantDataInfo(&transfer_params, sizeof(transfer_params))}; + + spec_vars = { + graph.hashed_layout_of(out), + graph.hashed_layout_of(in), + }; + } + + // Determine the shader directly + std::string kernel_name; + if (transfer_type == TransferType::SELECT) { + kernel_name = "select"; + } else { // TransferType::SLICE + kernel_name = "slice"; + } + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + // Create and add the dispatch node + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + 
graph, + VK_KERNEL_FROM_STR(kernel_name), + default_pick_global_wg_size, + default_pick_local_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {in, vkapi::kRead}}, + // Parameter buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + spec_vars, + // Resize Args + resize_args, + // Resizing Logic + resize_fn)); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Transfer.h b/backends/vulkan/runtime/graph/ops/impl/Transfer.h new file mode 100644 index 00000000000..09aae144994 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Transfer.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +namespace vkcompute { + +enum class TransferType { SELECT, SLICE }; + +/** + * Adds a transfer copy operation node to the compute graph, which implements + * operators for which each element of the output tensor maps to a unique + * element of the input tensor. + * + * This function currently handles the following operations: + * - select + * - slice + */ +void add_transfer_copy_node( + ComputeGraph& graph, + TransferType transfer_type, + const ValueRef in, + const ValueRef dim_ref, + const ValueRef index_or_start_ref, + const ValueRef end_ref, + const ValueRef step_ref, + const ValueRef out, + const std::vector& resize_args, + const ExecuteNode::ResizeFunction& resize_fn = nullptr); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Where.cpp b/backends/vulkan/runtime/graph/ops/impl/Where.cpp new file mode 100644 index 00000000000..a3be34830d3 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Where.cpp @@ -0,0 +1,126 @@ +// Where.cpp + +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +namespace vkcompute { + +void resize_where_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)extra_args; + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr in = graph->get_tensor(args[1].refs[0]); + + std::vector in_sizes = in->sizes(); + out->virtual_resize(in_sizes); +} + +void add_where_texture_node( + ComputeGraph& graph, + const ValueRef cond, + const ValueRef self, + const ValueRef other, + const ValueRef out) { + std::string kernel_name = "where"; + + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + const utils::uvec3 global_wg_size = graph.create_global_wg_size(out); + const utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size); + + graph.execute_nodes().emplace_back(new DispatchNode( + graph, + // Shader + VK_KERNEL_FROM_STR(kernel_name), + // Workgroup sizes + global_wg_size, + local_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {{cond, self, other}, vkapi::kRead}}, + // Parameter buffers + {graph.logical_limits_ubo(self)}, + // Push Constants + {}, + // Specialization Constants + {graph.packed_dim_of(out)}, + // Resize Arguments + {}, + // Resizing Logic + resize_where_node)); +} + +void add_where_buffer_node( + ComputeGraph& graph, + const ValueRef cond, + const ValueRef self, + const ValueRef other, + const ValueRef out) { + std::string kernel_name = "where"; + + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + const utils::uvec3 global_wg_size = graph.create_global_wg_size(out); + const utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size); + + vkapi::ParamsBindList ubos = { + graph.numel_ubo(out), + graph.strides_ubo(out), + 
graph.strides_ubo(cond), + graph.strides_ubo(self), + graph.strides_ubo(other)}; + + graph.execute_nodes().emplace_back(new DispatchNode( + graph, + // Shader + VK_KERNEL_FROM_STR(kernel_name), + // Workgroup sizes + global_wg_size, + local_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {{cond, self, other}, vkapi::kRead}}, + // Parameter buffers + ubos, + // Push Constants + {}, + // Specialization Constants + {graph.packed_dim_of(out), + graph.packed_dim_of(cond), + graph.packed_dim_of(self), + graph.packed_dim_of(other)}, + // Resize Arguments + {}, + // Resizing Logic + resize_where_node)); +} + +void where(ComputeGraph& graph, const std::vector& args) { + int args_i = 0; + const ValueRef cond = args[args_i++]; + const ValueRef self = args[args_i++]; + const ValueRef other = args[args_i++]; + const ValueRef out = args[args_i++]; + if (graph.is_buffer_storage(out)) { + add_where_buffer_node(graph, cond, self, other, out); + } else { + add_where_texture_node(graph, cond, self, other, out); + } +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten.where.self, where); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp index 469c2ed8280..e1ac4e9d40a 100644 --- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp @@ -49,6 +49,7 @@ void add_dtype_suffix(std::string& kernel_name, const vkapi::ScalarType dtype) { break; case vkapi::kByte: case vkapi::kQUInt8: + case vkapi::kBool: kernel_name += "_uint8"; break; default: diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp index fd7e6b78c22..6f3660fb0fc 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp @@ -22,13 +22,17 @@ bool is_bitw8(vkapi::ScalarType dtype) { 
vkapi::ShaderInfo get_nchw_to_tensor_shader( const api::vTensor& v_dst, - const bool int8_buffer_enabled) { + bool int8_buffer_enabled, + bool push_constant_variant) { std::string kernel_name; kernel_name.reserve(kShaderNameReserve); if (is_bitw8(v_dst.dtype()) && v_dst.storage_type() != utils::kBuffer && !int8_buffer_enabled) { kernel_name = "nchw_to_bitw8_image_nobitw8buffer"; + if (!push_constant_variant) { + kernel_name += "_no_pc"; + } add_storage_type_suffix(kernel_name, v_dst); add_dtype_suffix(kernel_name, v_dst); return VK_KERNEL_FROM_STR(kernel_name); @@ -36,11 +40,17 @@ vkapi::ShaderInfo get_nchw_to_tensor_shader( if (v_dst.storage_type() == utils::kBuffer) { kernel_name = "nchw_to_buffer"; + if (!push_constant_variant) { + kernel_name += "_no_pc"; + } add_dtype_suffix(kernel_name, v_dst); return VK_KERNEL_FROM_STR(kernel_name); } kernel_name = "nchw_to_image"; + if (!push_constant_variant) { + kernel_name += "_no_pc"; + } add_storage_type_suffix(kernel_name, v_dst); add_dtype_suffix(kernel_name, v_dst); diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h index 8d63958a738..6abbac45823 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h @@ -14,7 +14,8 @@ namespace vkcompute { vkapi::ShaderInfo get_nchw_to_tensor_shader( const api::vTensor& v_dst, - bool int8_buffer_enabled = true); + bool int8_buffer_enabled = true, + bool push_constant_variant = true); vkapi::ShaderInfo get_tensor_to_nchw_shader( const api::vTensor& v_src, bool int8_buffer_enabled = true); diff --git a/backends/vulkan/runtime/vk_api/Types.h b/backends/vulkan/runtime/vk_api/Types.h index 7191409c215..6531bf4710c 100644 --- a/backends/vulkan/runtime/vk_api/Types.h +++ b/backends/vulkan/runtime/vk_api/Types.h @@ -27,7 +27,7 @@ _(uint8_t, VK_FORMAT_R8G8B8A8_UINT, Byte) \ _(int8_t, VK_FORMAT_R8G8B8A8_SINT, Char) \ _(int32_t, 
VK_FORMAT_R32G32B32A32_SINT, Int) \ - _(bool, VK_FORMAT_R8G8B8A8_SINT, Bool) \ + _(uint8_t, VK_FORMAT_R8G8B8A8_UINT, Bool) \ _(uint16_t, VK_FORMAT_R16G16B16A16_SFLOAT, Half) \ _(float, VK_FORMAT_FLOAT4, Float) \ _(int8_t, VK_FORMAT_R8G8B8A8_SINT, QInt8) \ diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index 4a12f16bbf9..bf6e9683ef7 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -499,7 +499,9 @@ def get_ones_inputs(): def get_select_int_inputs(): test_suite = VkTestSuite( [ - ((6, 2, 7), 0, 3), + ((8, 8, 8), 0, -2), + ((8, 8, 8), 1, -3), + ((8, 8, 8), 2, -4), ((6, 2, 7), 1, 0), ((6, 2, 7), 2, 3), ((6, 10, 7), 0, 3), @@ -515,6 +517,10 @@ def get_select_int_inputs(): ((8, 6, 1, 1), 1, 4), ] ) + test_suite.layouts = ["utils::kWidthPacked", "utils::kChannelsPacked"] + test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] + test_suite.dtypes = ["at::kFloat"] + test_suite.data_gen = "make_seq_tensor" return test_suite @@ -1147,6 +1153,7 @@ def get_reduce_op_inputs(): "aten.hardsigmoid.default", "aten.leaky_relu.default", "aten.round.default", + "aten.tan.default", ] ) def get_unary_ops_inputs(): @@ -1349,3 +1356,28 @@ def get_flip_inputs(): test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) return test_suite + + +@register_test_suite("aten.where.self") +def get_where_inputs(): + Test = namedtuple("Where", ["condition", "self", "other"]) + Test.__new__.__defaults__ = (None, None, None) + + test_cases = [ + Test(condition=[11], self=[11], other=[11]), + Test(condition=[10, 9], self=[10, 9], other=[10, 9]), + Test(condition=[10, 5, 3], self=[10, 5, 3], other=[10, 5, 3]), + Test(condition=[2, 10, 5, 3], self=[2, 10, 5, 3], other=[2, 10, 5, 3]), + ] + + test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) + test_suite.arg_dtype["condition"] = "at::kBool" + test_suite.layouts = [ + "utils::kWidthPacked", + "utils::kHeightPacked", + 
"utils::kChannelsPacked", + ] + test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"] + test_suite.atol = "1e-4" + test_suite.rtol = "1e-4" + return test_suite diff --git a/backends/vulkan/test/op_tests/utils/gen_correctness_base.py b/backends/vulkan/test/op_tests/utils/gen_correctness_base.py index e6ce135736b..5be4ddba6bf 100644 --- a/backends/vulkan/test/op_tests/utils/gen_correctness_base.py +++ b/backends/vulkan/test/op_tests/utils/gen_correctness_base.py @@ -282,12 +282,16 @@ def generate_suite_cpp(self) -> str: at::ScalarType dtype = at::kFloat, float low = 0.0, float high = 1.0) {{ - if (high == 1.0 && low == 0.0) - return at::rand(sizes, at::device(at::kCPU).dtype(dtype)); if (dtype == at::kChar) return at::randint(high, sizes, at::device(at::kCPU).dtype(dtype)); + if (dtype == at::kBool) + return at::rand(sizes, at::device(at::kCPU)) > 0.5; + + if (high == 1.0 && low == 0.0) + return at::rand(sizes, at::device(at::kCPU).dtype(dtype)); + return at::rand(sizes, at::device(at::kCPU).dtype(dtype)) * (high - low) + low; }} diff --git a/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py b/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py index 6c165a777db..ce6ab32ce60 100644 --- a/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py +++ b/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py @@ -119,6 +119,8 @@ def gen_parameterization(self) -> str: return vkapi::kInt; case c10::kChar: return vkapi::kChar; + case c10::kBool: + return vkapi::kBool; default: VK_THROW("Unsupported at::ScalarType!"); } diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py index 80ead02de9f..447e5d039f4 100644 --- a/backends/vulkan/test/test_vulkan_delegate.py +++ b/backends/vulkan/test/test_vulkan_delegate.py @@ -1842,3 +1842,50 @@ def forward(self, x): dynamic_shapes=dynamic_shapes, test_inputs=test_inputs, ) + + def test_select_last_height_dynamic_shapes(self): + """ + Test selecting the last 
element along the height dimension with dynamic shapes. + The height dimension (dim=1) is variable. + """ + + class SelectLastHeightModule(torch.nn.Module): + """ + Module that selects the last element along the height dimension (dim=1) of a 3D tensor. + This is equivalent to the operation: x[:, -1, :] + """ + + def __init__(self): + super().__init__() + + def forward(self, x): + # Select the last element along dimension 1 (height) + return x[:, -1, :] + + # Create the module + module = SelectLastHeightModule() + + # Create sample inputs with a specific shape + # Shape: [batch_size, height, width] + sample_inputs = (torch.arange(1, 61).reshape(2, 10, 3).float(),) + + # Define dynamic shapes for the height dimension + height = Dim("height", min=1, max=10) + dynamic_shapes = {"x": {1: height}} + + # Create test inputs with different heights + test_inputs = [ + (torch.arange(1, 7).reshape(2, 1, 3).float(),), # Minimum height + (torch.arange(1, 19).reshape(2, 3, 3).float(),), # Small height + (torch.arange(1, 43).reshape(2, 7, 3).float(),), # Medium height + (torch.arange(1, 31).reshape(2, 5, 3).float(),), # Maximum height + ] + + # Use the testing infrastructure from TestVulkanBackend + test_backend = TestVulkanBackend() + test_backend.lower_module_and_test_output( + module, + sample_inputs, + dynamic_shapes=dynamic_shapes, + test_inputs=test_inputs, + ) diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index c4acb41b7b0..dcd8c425d62 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -28,7 +28,7 @@ void record_nchw_to_buffer_op( vkapi::PipelineBarrier pipeline_barrier{}; context->submit_compute_job( - get_nchw_to_tensor_shader(v_dst), + get_nchw_to_tensor_shader(v_dst, true, false), pipeline_barrier, {uint32_t(v_dst.numel()), 1, 1}, {64, 1, 1}, @@ -74,7 +74,9 @@ void record_nchw_to_image_op( context->submit_compute_job( get_nchw_to_tensor_shader( - v_dst, 
context->adapter_ptr()->has_full_int8_buffers_support()), + v_dst, + context->adapter_ptr()->has_full_int8_buffers_support(), + false), pipeline_barrier, v_dst.logical_limits(), adaptive_work_group_size(v_dst.logical_limits()), diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index a6475d95d07..85811aaaf11 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -1600,8 +1601,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { /*shared_object_idx = */ 4); // +2: t.sizes_ubo() for each staging shader - // +2: staging buffer for each input tensor - expected_vma_allocation_count += 4; + expected_vma_allocation_count += 2; EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); ValueRef c = graph.add_tensor( @@ -1621,8 +1621,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { /*shared_object_idx = */ 2); // +1: t.sizes_ubo() uniform buffer for staging shader - // +1: staging buffer for the input tensor - expected_vma_allocation_count += 2; + expected_vma_allocation_count += 1; EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); ValueRef e = graph.add_tensor( @@ -1660,9 +1659,8 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { for (auto& new_sizes : new_sizes_list) { graph.get_tensor(a.value)->virtual_resize(new_sizes); graph.get_tensor(b.value)->virtual_resize(new_sizes); - graph.get_tensor(c)->virtual_resize(new_sizes); graph.get_tensor(d.value)->virtual_resize(new_sizes); - graph.get_tensor(e)->virtual_resize(new_sizes); + graph.propagate_resize(); float val_a = new_sizes[1] + 4.0f; float val_b = new_sizes[2] + 1.5f; @@ -3315,17 +3313,23 @@ vkapi::ShaderInfo pick_dynamic_dispatch_shader( utils::uvec3 pick_dynamic_dispatch_global_wg_size( ComputeGraph* graph, + const 
vkapi::ShaderInfo& shader, const std::vector& args, - const std::vector& additional_args) { + const std::vector& resize_args) { + (void)shader; const ValueRef out = args[0].refs[0]; - return graph->logical_limits_of(out); } utils::uvec3 pick_dynamic_dispatch_local_wg_size( ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, const std::vector& args, - const std::vector& additional_args) { + const std::vector& resize_args) { + (void)graph; + (void)shader; + (void)global_workgroup_size; return {64, 1, 1}; } diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index a1fee7427fc..b6ba211ecb3 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -61,7 +61,10 @@ foreach(fbs_file ${_xnnpack_schema__srcs}) endforeach() if(WIN32) - set(MV_COMMAND powershell -Command "Move-Item -Path ${_xnnpack_flatbuffer__outputs} -Destination ${_xnnpack_schema__outputs}") + set(MV_COMMAND + powershell -Command + "Move-Item -Path ${_xnnpack_flatbuffer__outputs} -Destination ${_xnnpack_schema__outputs}" + ) else() set(MV_COMMAND mv ${_xnnpack_flatbuffer__outputs} ${_xnnpack_schema__outputs}) endif() @@ -96,7 +99,8 @@ include(cmake/Dependencies.cmake) list(TRANSFORM _xnnpack_backend__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(xnnpack_backend ${_xnnpack_backend__srcs}) target_link_libraries( - xnnpack_backend PUBLIC ${xnnpack_third_party} executorch_core xnnpack_schema extension_threadpool + xnnpack_backend PUBLIC ${xnnpack_third_party} executorch_core xnnpack_schema + extension_threadpool ) target_include_directories( @@ -114,46 +118,8 @@ target_include_directories( target_compile_options(xnnpack_backend PUBLIC ${_common_compile_options}) target_link_options_shared_lib(xnnpack_backend) -if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) - list(APPEND xnn_executor_runner_libs optimized_native_cpu_ops_lib) -else() - list(APPEND xnn_executor_runner_libs portable_ops_lib) -endif() - 
-if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - list(APPEND xnn_executor_runner_libs $) -endif() - -if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) - list(APPEND xnn_executor_runner_libs quantized_ops_lib) -endif() - -list(APPEND xnn_executor_runner_libs xnnpack_backend executorch) - -# ios can only build library but not binary -if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$") - # - # xnn_executor_runner: Like executor_runner but with XNNPACK, the binary will - # be at ${CMAKE_BINARY_DIR}/backends/xnnpack - # - list(TRANSFORM _xnn_executor_runner__srcs PREPEND "${EXECUTORCH_ROOT}/") - add_executable(xnn_executor_runner ${_xnn_executor_runner__srcs}) - - if(EXECUTORCH_ENABLE_EVENT_TRACER) - list(APPEND xnn_executor_runner_libs etdump) - endif() - - target_link_libraries(xnn_executor_runner gflags ${xnn_executor_runner_libs}) - target_compile_options(xnn_executor_runner PUBLIC ${_common_compile_options}) - if(EXECUTORCH_BUILD_PTHREADPOOL) - target_link_libraries(xnn_executor_runner extension_threadpool pthreadpool) - target_compile_definitions(xnn_executor_runner PRIVATE ET_USE_THREADPOOL) - endif() -endif() - install( TARGETS xnnpack_backend - DESTINATION lib INCLUDES DESTINATION ${_common_include_directories} ) diff --git a/backends/xnnpack/README.md b/backends/xnnpack/README.md index 2328f8e4b90..411bec99d79 100644 --- a/backends/xnnpack/README.md +++ b/backends/xnnpack/README.md @@ -92,7 +92,7 @@ After lowering to the XNNPACK Program, we can then prepare it for executorch and ### Running the XNNPACK Model with CMake -After exporting the XNNPACK Delegated model, we can now try running it with example inputs using CMake. We can build and use the xnn_executor_runner, which is a sample wrapper for the ExecuTorch Runtime and XNNPACK Backend. We first begin by configuring the CMake build like such: +After exporting the XNNPACK Delegated model, we can now try running it with example inputs using CMake. 
We can build and use the executor_runner, which is a sample wrapper for the ExecuTorch Runtime. The XNNPACK Backend is enabled via the compilation flag `-DEXECUTORCH_BUILD_XNNPACK=ON`. We first begin by configuring the CMake build like such: ```bash # cd to the root of executorch repo cd executorch @@ -119,9 +119,9 @@ Then you can build the runtime componenets with cmake --build cmake-out -j9 --target install --config Release ``` -Now you should be able to find the executable built at `./cmake-out/backends/xnnpack/xnn_executor_runner` you can run the executable with the model you generated as such +Now you should be able to find the executable built at `./cmake-out/executor_runner` you can run the executable with the model you generated as such ```bash -./cmake-out/backends/xnnpack/xnn_executor_runner --model_path=./mv2_xnnpack_fp32.pte +./cmake-out/executor_runner --model_path=./mv2_xnnpack_fp32.pte ``` ## Help & Improvements diff --git a/backends/xnnpack/operators/node_visitor.py b/backends/xnnpack/operators/node_visitor.py index 8470184d808..b7d16b18bd1 100644 --- a/backends/xnnpack/operators/node_visitor.py +++ b/backends/xnnpack/operators/node_visitor.py @@ -274,19 +274,46 @@ def get_per_channel_dtype( return dtype - def get_quant_params(self, quant_params: QuantParams) -> XNNQuantParams: + def get_quant_params( + self, quant_params: QuantParams, xnn_graph: XNNGraph + ) -> XNNQuantParams: if quant_params.per_channel: scale = cast(torch.Tensor, quant_params.scale) + buffer_idx = len(xnn_graph.constant_data) + num_scales = scale.numel() + + if quant_params.is_per_channel_group: + scale = scale.to(torch.bfloat16) + + num_bytes = scale.untyped_storage().nbytes() + scale_array = ctypes.cast( + scale.untyped_storage().data_ptr(), + ctypes.POINTER(ctypes.c_char * num_bytes), + ).contents + scale_name = hashlib.sha256(bytes(scale_array)).hexdigest() + xnn_graph.constant_data.append( + ConstantDataOffset( + offset=UINT64_MAX, size=num_bytes, named_key=scale_name + ) + 
) + self._named_data_store.add_named_data( + scale_name, bytes(scale_array), CONSTANT_TENSOR_ALIGNMENT + ) + if quant_params.is_per_channel_group: return PerChannelGroupQuant( - scale=scale.flatten().tolist(), + scale=[], channel_dim=quant_params.axis, group_size=quant_params.group_size, + scale_buffer_idx=buffer_idx, + num_scales=num_scales, ) - else: # per_channel quant + else: return PerChannelQuant( - scale=scale.tolist(), + scale=[], channel_dim=quant_params.axis, + scale_buffer_idx=buffer_idx, + num_scales=num_scales, ) elif quant_params.is_dynamic: # NB: @@ -449,7 +476,7 @@ def define_tensor( # noqa: C901 else XValue( xvalue_union=XNNQuantizedTensorValue( tensor_value=tvalue, - quant_params=self.get_quant_params(quant_params), + quant_params=self.get_quant_params(quant_params, xnn_graph), ) ) ) diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index 9fd2c55bb83..56d0508bef0 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -421,11 +421,32 @@ Error defineTensor( qparams->channel_dim(), dtype, zero_point); + + const float* scale = qparams->scale()->data(); + + if (qparams->scale_buffer_idx() != 0) { + // if scales are stored in named data, then retrieve it + ConstantDataOffsetPtr scale_buffer_offset = + flatbuffer_graph->constant_data()->Get( + qparams->scale_buffer_idx()); + const std::string& data_name = + scale_buffer_offset->named_key()->str(); + Result scale_buffer = + named_data_map->get_data(data_name.c_str()); + ET_CHECK_OR_RETURN_ERROR( + scale_buffer.ok(), + Internal, + "Failed to get constant data for key %s from named_data_map. 
Error code: %u", + data_name.c_str(), + static_cast(scale_buffer.error())); + scale = reinterpret_cast(scale_buffer.get().data()); + freeable_buffers.push_back(std::move(scale_buffer.get())); + } status = xnn_define_channelwise_quantized_tensor_value_v2( /*subgraph=*/subgraph_ptr, /*datatype=*/dtype, /*zero_point=*/zero_point, - /*scale=*/qparams->scale()->data(), + /*scale=*/scale, /*num_dims=*/tensor_value->num_dims(), /*channel_dim*/ qparams->channel_dim(), /*dims=*/dims_data.data(), @@ -452,10 +473,24 @@ Error defineTensor( // Block scales are preferably serialized as bf16 but can also be // serialized as fp32 for backwards compatability. - if (qparams->scale_bf16() != nullptr) { + if (qparams->scale_buffer_idx() != 0) { + ConstantDataOffsetPtr scale_buffer_offset = + flatbuffer_graph->constant_data()->Get( + qparams->scale_buffer_idx()); + const std::string& data_name = + scale_buffer_offset->named_key()->str(); + Result scale_buffer = + named_data_map->get_data(data_name.c_str()); + ET_CHECK_OR_RETURN_ERROR( + scale_buffer.ok(), + Internal, + "Failed to get constant data for key %s from named_data_map. Error code: %u", + data_name.c_str(), + static_cast(scale_buffer.error())); scale_data = - static_cast(qparams->scale_bf16()->data()); - scale_numel = qparams->scale_bf16()->size(); + reinterpret_cast(scale_buffer.get().data()); + freeable_buffers.push_back(std::move(scale_buffer.get())); + scale_numel = qparams->num_scales(); } else { // Read fp32 scales, convert to bf16. 
auto conv_buffer = static_cast(allocator.allocateTemporary( diff --git a/backends/xnnpack/serialization/runtime_schema.fbs b/backends/xnnpack/serialization/runtime_schema.fbs index 79502ad4e51..d76c3c0807e 100644 --- a/backends/xnnpack/serialization/runtime_schema.fbs +++ b/backends/xnnpack/serialization/runtime_schema.fbs @@ -48,6 +48,8 @@ table Buffer { table PerChannelQuant { scale:[float]; channel_dim:int; + scale_buffer_idx: uint; + num_scales: uint; } table PerTokenDynamicQuant { @@ -63,7 +65,9 @@ table PerChannelGroupQuant { scale:[float]; channel_dim:int; group_size:int; - scale_bf16:[ushort]; + scale_bf16:[ushort] (deprecated); + scale_buffer_idx: uint; + num_scales: uint; } table XNNTensorValue { diff --git a/backends/xnnpack/serialization/schema.fbs b/backends/xnnpack/serialization/schema.fbs index a231ed05c5d..356df663dfc 100644 --- a/backends/xnnpack/serialization/schema.fbs +++ b/backends/xnnpack/serialization/schema.fbs @@ -48,12 +48,16 @@ table PerChannelGroupQuant { scale:[float]; channel_dim:int; group_size:int; - scale_bf16:[ushort]; + scale_bf16:[ushort] (deprecated); + scale_buffer_idx: uint; + num_scales: uint; } table PerChannelQuant { scale:[float]; channel_dim:int; + scale_buffer_idx: uint; + num_scales: uint; } table PerTokenDynamicQuant { diff --git a/backends/xnnpack/serialization/xnnpack_graph_schema.py b/backends/xnnpack/serialization/xnnpack_graph_schema.py index 3a39fe98279..b8b4ea7f02f 100644 --- a/backends/xnnpack/serialization/xnnpack_graph_schema.py +++ b/backends/xnnpack/serialization/xnnpack_graph_schema.py @@ -425,6 +425,13 @@ class XNNDatatype(IntEnum): class PerChannelQuant: scale: List[float] channel_dim: int + scale_buffer_idx: int = -1 + num_scales: int = -1 + + +@dataclass +class Buffer: + storage: bytes @dataclass @@ -432,6 +439,9 @@ class PerChannelGroupQuant: scale: List[float] channel_dim: int group_size: int = 1 + scale_bf16: Optional[List[float]] = None + scale_buffer_idx: int = -1 + num_scales: int = -1 @dataclass 
diff --git a/backends/xnnpack/third-party/cpuinfo b/backends/xnnpack/third-party/cpuinfo index 1e83a2fdd31..c61fe919607 160000 --- a/backends/xnnpack/third-party/cpuinfo +++ b/backends/xnnpack/third-party/cpuinfo @@ -1 +1 @@ -Subproject commit 1e83a2fdd3102f65c6f1fb602c1b320486218a99 +Subproject commit c61fe919607bbc534d7a5a5707bdd7041e72c5ff diff --git a/codegen/api/et_cpp.py b/codegen/api/et_cpp.py index 5703af89c5a..88f1eb83fe0 100644 --- a/codegen/api/et_cpp.py +++ b/codegen/api/et_cpp.py @@ -2,15 +2,6 @@ from typing import TYPE_CHECKING -from executorch.codegen.api.types import ( - ArrayRefCType, - BaseTypeToCppMapping, - OptionalCType, - scalarT, - tensorListT, - tensorT, -) - from torchgen import local from torchgen.api.types import ( ArgName, @@ -40,6 +31,15 @@ ) from typing_extensions import assert_never +from .types import ( + ArrayRefCType, + BaseTypeToCppMapping, + OptionalCType, + scalarT, + tensorListT, + tensorT, +) + if TYPE_CHECKING: from collections.abc import Sequence diff --git a/codegen/api/types/__init__.py b/codegen/api/types/__init__.py index 9de50ae744a..628c0637ced 100644 --- a/codegen/api/types/__init__.py +++ b/codegen/api/types/__init__.py @@ -1,5 +1,5 @@ # flake8: noqa: F403, F401 -from executorch.codegen.api.types.types import * +from .types import * # flake8: noqa: F403, F401 -from executorch.codegen.api.types.signatures import * # usort: skip +from .signatures import * # usort: skip diff --git a/codegen/api/types/signatures.py b/codegen/api/types/signatures.py index 0b41b227c4e..6342a3f7a5e 100644 --- a/codegen/api/types/signatures.py +++ b/codegen/api/types/signatures.py @@ -4,7 +4,8 @@ from typing import TYPE_CHECKING import torchgen.api.cpp as aten_cpp -from executorch.codegen.api.types.types import contextArg + +from .types import contextArg if TYPE_CHECKING: @@ -73,4 +74,4 @@ def from_native_function( ) -from executorch.codegen.api import et_cpp +from .. 
import et_cpp diff --git a/codegen/gen.py b/codegen/gen.py index 43dc296a317..0dc1a167712 100644 --- a/codegen/gen.py +++ b/codegen/gen.py @@ -8,15 +8,32 @@ from typing import Any, Callable, TextIO, TYPE_CHECKING import yaml -from executorch.codegen.api import et_cpp -from executorch.codegen.api.custom_ops import ( - ComputeNativeFunctionStub, - gen_custom_ops_registration, -) -from executorch.codegen.api.types import contextArg, ExecutorchCppSignature -from executorch.codegen.api.unboxing import Unboxing -from executorch.codegen.model import ETKernelIndex, ETKernelKey, ETParsedYaml -from executorch.codegen.parse import ET_FIELDS, parse_et_yaml, parse_et_yaml_struct + +try: + from executorch.codegen.api import et_cpp + from executorch.codegen.api.custom_ops import ( + ComputeNativeFunctionStub, + gen_custom_ops_registration, + ) + from executorch.codegen.api.types import contextArg, ExecutorchCppSignature + from executorch.codegen.api.unboxing import Unboxing + from executorch.codegen.model import ETKernelIndex, ETKernelKey, ETParsedYaml + from executorch.codegen.parse import ET_FIELDS, parse_et_yaml, parse_et_yaml_struct +except ImportError: + # If we build from source, executorch.codegen is not available. + from .api import et_cpp # type: ignore[no-redef] + from .api.custom_ops import ( # type: ignore + ComputeNativeFunctionStub, + gen_custom_ops_registration, + ) + from .api.types import contextArg, ExecutorchCppSignature # type: ignore + from .api.unboxing import Unboxing # type: ignore + from .model import ETKernelIndex, ETKernelKey, ETParsedYaml # type: ignore + from .parse import ( # type: ignore[no-redef] + ET_FIELDS, + parse_et_yaml, + parse_et_yaml_struct, + ) # Parse native_functions.yaml into a sequence of NativeFunctions and Backend Indices. 
from torchgen import dest diff --git a/codegen/test/test_executorch_custom_ops.py b/codegen/test/test_executorch_custom_ops.py index 847f87ab352..67dccc3a8c1 100644 --- a/codegen/test/test_executorch_custom_ops.py +++ b/codegen/test/test_executorch_custom_ops.py @@ -15,8 +15,8 @@ import torchgen from executorch.codegen.api.custom_ops import ComputeNativeFunctionStub +from executorch.codegen.gen import gen_headers from executorch.codegen.model import ETKernelIndex -from torchgen.gen_executorch import gen_headers from torchgen.model import Location, NativeFunction from torchgen.selective_build.selector import SelectiveBuilder from torchgen.utils import FileManager diff --git a/codegen/test/test_executorch_gen.py b/codegen/test/test_executorch_gen.py index 23dcbecf64a..30c82254de7 100644 --- a/codegen/test/test_executorch_gen.py +++ b/codegen/test/test_executorch_gen.py @@ -11,15 +11,15 @@ import unittest import yaml - -from executorch.codegen.model import ETKernelIndex, ETKernelKey -from torchgen.gen import LineLoader -from torchgen.gen_executorch import ( +from executorch.codegen.gen import ( ComputeCodegenUnboxedKernels, gen_functions_declarations, parse_yaml_files, translate_native_yaml, ) + +from executorch.codegen.model import ETKernelIndex, ETKernelKey +from torchgen.gen import LineLoader from torchgen.model import ( BackendIndex, BackendMetadata, diff --git a/codegen/tools/gen_oplist.py b/codegen/tools/gen_oplist.py index b1f4af02889..3d26797fb24 100644 --- a/codegen/tools/gen_oplist.py +++ b/codegen/tools/gen_oplist.py @@ -20,6 +20,7 @@ # We can use relative import instead. 
from ..parse import strip_et_fields + from torchgen.gen import LineLoader, parse_native_yaml_struct from torchgen.selective_build.operator import SelectiveBuildOperator from torchgen.selective_build.selector import merge_et_kernel_metadata diff --git a/devtools/etrecord/_etrecord.py b/devtools/etrecord/_etrecord.py index de7cf93990a..d5ad81fe255 100644 --- a/devtools/etrecord/_etrecord.py +++ b/devtools/etrecord/_etrecord.py @@ -29,6 +29,7 @@ from executorch.exir.serde.export_serialize import SerializedArtifact from executorch.exir.serde.serialize import deserialize, serialize +ProgramInput = List[Value] ProgramOutput = List[Value] try: @@ -49,6 +50,7 @@ class ETRecordReservedFileNames(StrEnum): DEBUG_HANDLE_MAP_NAME = "debug_handle_map" DELEGATE_MAP_NAME = "delegate_map" REFERENCE_OUTPUTS = "reference_outputs" + REPRESENTATIVE_INPUTS = "representative_inputs" @dataclass @@ -60,6 +62,7 @@ class ETRecord: Dict[str, Dict[int, Dict[str, Union[str, _DelegateDebugIdentifierMap]]]] ] = None _reference_outputs: Optional[Dict[str, List[ProgramOutput]]] = None + _representative_inputs: Optional[List[ProgramOutput]] = None def _handle_exported_program( @@ -157,6 +160,24 @@ def _get_reference_outputs( return reference_outputs +def _get_representative_inputs( + bundled_program: BundledProgram, +) -> List[ProgramInput]: + """ + Extracts out the inputs from the bundled program, keyed by the method names. + """ + for method_test_suite in bundled_program.method_test_suites: + if method_test_suite.method_name == "forward": + if not method_test_suite.test_cases: + raise ValueError( + "The 'forward' method is defined, but no corresponding input test cases are provided." 
+ ) + # Get first example input from the forward method + test_case = method_test_suite.test_cases[0] + return test_case.inputs + raise ValueError("No 'forward' method found in the bundled program.") + + def generate_etrecord( et_record: Union[str, os.PathLike, BinaryIO, IO[bytes]], edge_dialect_program: Union[EdgeProgramManager, ExirExportedProgram], @@ -244,6 +265,13 @@ def generate_etrecord( # @lint-ignore PYTHONPICKLEISBAD pickle.dumps(reference_outputs), ) + + representative_inputs = _get_representative_inputs(executorch_program) + etrecord_zip.writestr( + ETRecordReservedFileNames.REPRESENTATIVE_INPUTS, + # @lint-ignore PYTHONPICKLEISBAD + pickle.dumps(representative_inputs), + ) executorch_program = executorch_program.executorch_program etrecord_zip.writestr( @@ -290,6 +318,7 @@ def parse_etrecord(etrecord_path: str) -> ETRecord: # noqa: C901 delegate_map = None edge_dialect_program = None reference_outputs = None + representative_inputs = None serialized_exported_program_files = set() serialized_state_dict_files = set() @@ -321,6 +350,11 @@ def parse_etrecord(etrecord_path: str) -> ETRecord: # noqa: C901 reference_outputs = pickle.loads( etrecord_zip.read(ETRecordReservedFileNames.REFERENCE_OUTPUTS) ) + elif entry == ETRecordReservedFileNames.REPRESENTATIVE_INPUTS: + # @lint-ignore PYTHONPICKLEISBAD + representative_inputs = pickle.loads( + etrecord_zip.read(ETRecordReservedFileNames.REPRESENTATIVE_INPUTS) + ) else: if entry.endswith("state_dict"): serialized_state_dict_files.add(entry) @@ -352,4 +386,5 @@ def parse_etrecord(etrecord_path: str) -> ETRecord: # noqa: C901 _debug_handle_map=debug_handle_map, _delegate_map=delegate_map, _reference_outputs=reference_outputs, + _representative_inputs=representative_inputs, ) diff --git a/devtools/etrecord/tests/etrecord_test.py b/devtools/etrecord/tests/etrecord_test.py index cf50662c2a1..dd1d40e0292 100644 --- a/devtools/etrecord/tests/etrecord_test.py +++ b/devtools/etrecord/tests/etrecord_test.py @@ -19,6 
+19,7 @@ from executorch.devtools.etrecord import generate_etrecord, parse_etrecord from executorch.devtools.etrecord._etrecord import ( _get_reference_outputs, + _get_representative_inputs, ETRecordReservedFileNames, ) from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge @@ -135,15 +136,25 @@ def test_etrecord_generation_with_bundled_program(self): ) etrecord = parse_etrecord(tmpdirname + "/etrecord.bin") - expected = etrecord._reference_outputs - actual = _get_reference_outputs(bundled_program) + expected_inputs = etrecord._representative_inputs + actual_inputs = _get_representative_inputs(bundled_program) # assertEqual() gives "RuntimeError: Boolean value of Tensor with more than one value is ambiguous" when comparing tensors, # so we use torch.equal() to compare the tensors one by one. + for expected, actual in zip(expected_inputs, actual_inputs): + self.assertTrue(torch.equal(expected[0], actual[0])) + self.assertTrue(torch.equal(expected[1], actual[1])) + + expected_outputs = etrecord._reference_outputs + actual_outputs = _get_reference_outputs(bundled_program) self.assertTrue( - torch.equal(expected["forward"][0][0], actual["forward"][0][0]) + torch.equal( + expected_outputs["forward"][0][0], actual_outputs["forward"][0][0] + ) ) self.assertTrue( - torch.equal(expected["forward"][1][0], actual["forward"][1][0]) + torch.equal( + expected_outputs["forward"][1][0], actual_outputs["forward"][1][0] + ) ) def test_etrecord_generation_with_manager(self): diff --git a/devtools/inspector/_intermediate_output_capturer.py b/devtools/inspector/_intermediate_output_capturer.py index e3a904487eb..c1f943bd02c 100644 --- a/devtools/inspector/_intermediate_output_capturer.py +++ b/devtools/inspector/_intermediate_output_capturer.py @@ -7,24 +7,57 @@ # pyre-unsafe -from typing import Any, Dict, Tuple +from typing import Any, Dict, List, Tuple import torch from torch.fx import GraphModule from torch.fx.interpreter import Interpreter +class NodeFilter: + 
""" + A class used to filter nodes based on extensible criteria. + Attributes: + metadata_key (str): The key to look for in the node's metadata. + op_type (str): The operation code to match. + exclude_ops (List[str]): A list of operations to exclude from the filter. + """ + + def __init__(self, metadata_key: str, op_type: str, exclude_ops: List[str] = None): + self.metadata_key = metadata_key + self.op_type = op_type + self.exclude_ops = exclude_ops + + def matches(self, node: torch.fx.Node) -> bool: + return ( + node.meta.get(self.metadata_key) is not None + and node.op == self.op_type + and all(exclude_name not in node.name for exclude_name in self.exclude_ops) + ) + + class IntermediateOutputCapturer(Interpreter): + """ + A class that captures intermediate outputs from a PyTorch graph module. + Attributes: + module (GraphModule): The graph module to capture outputs from. + node_filters (List[NodeFilter]): A list of filters to apply to the nodes. + """ + def __init__(self, module: GraphModule): super().__init__(module) + self.node_filters = [ + NodeFilter("debug_handle", "call_function", exclude_ops=["getitem"]) + ] + # Runs the graph module and captures the intermediate outputs. 
def run_and_capture(self, *args, **kwargs) -> Dict[Tuple[int, ...], Any]: captured_outputs = {} def capture_run_node(n: torch.fx.Node) -> Any: result = super(IntermediateOutputCapturer, self).run_node(n) - debug_handle = n.meta.get("debug_handle", None) - if debug_handle is not None and n.op == "call_function": + if all(filter.matches(n) for filter in self.node_filters): + debug_handle = n.meta["debug_handle"] # Convert the debug handle to a tuple to use as a dictionary key key = ( (debug_handle,) diff --git a/devtools/inspector/tests/intermediate_output_capturer_test.py b/devtools/inspector/tests/intermediate_output_capturer_test.py index e6dd782d887..7ad673c7cfe 100644 --- a/devtools/inspector/tests/intermediate_output_capturer_test.py +++ b/devtools/inspector/tests/intermediate_output_capturer_test.py @@ -111,8 +111,6 @@ def test_capture_correct_outputs(self): (19,): torch.tensor([[3.6000, 4.5067]]), (20,): torch.tensor([[0.9734, 0.9891]]), (21,): [torch.tensor([[0.9734]]), torch.tensor([[0.9891]])], - (22,): torch.tensor([[0.9734]]), - (23,): torch.tensor([[0.9891]]), } self.assertEqual( len(self.intermediate_outputs), len(expected_outputs_with_handles) diff --git a/docs/source/_static/img/swiftpm_xcode2.png b/docs/source/_static/img/swiftpm_xcode2.png index fbe5d51fd4b98ecec043e5454e8e0d1cb5fc8d04..db811ddf05de2d1e69a6dffdff22220290b5eb63 100644 GIT binary patch literal 55550 zcmeFYbyQqSw=c@xaf4uuLxNjFun;@}8kgYESa65nPR9t+xVyUscSwK+g1ZFUbmJ0e zT$*0Dll`4@zH`oZ?ilaBJKkS!F<8A;Rn4loX3eVl&EKjO@>)gq(S6GMczAe^WKrGtf=$rCRJdq-DcFERSxC4_O;zmmD>pZqT3 zW+z6ktNi+jl#`3aljmH|xSr9A-+%JtiKvUYrLelR%s<3&cVhI`Zfb)UKaLx(l!nj zj;^>i#Cdpxctw8;{I{(CX!4&#b^l3}@7aG6{im$|Dyrpb;UeYafNRuET*J)80@t{; zliNSq{pZYo2#Rw5diH;^#ovbhor<%x_gsA@VuFv4PfScqS6BDt%a>hUT_-0e zFJ8P*R#v{g#%^qETw{U7#l@qeqsYie6&01n&e`JP;>O0t&d$!++1bs_&5Mf*0|Ns= zK|%UR;I==2LZQ^v)z8k(^7Hf8aLuf)zIpTJX*j^n&VF!k;O*`G?Af!vzCKAw$>!$f z(b3WU{r!c7g{rEm%gf8Z{`w2&RqJbOtLv*qMn>uB>2h*%KYskk%gbYBW#!`HDk>@} 
zD=TA(2I}kUOG``5%*+A<1DPX%p`oGP-rn%=a9&>CgHvE{Z||lb@P|Kete19pczAq# ztf!}!mXFH@FC#R^WsPXafnVA_? zRaF=awsiyu2nfi_%g4pV<>chHw6xrQ53oi7OcB8P+Pb5oV{23|oB9R|Ieq3H&c5`#13IP%m6RAQ0UteEybMxTf;BVi)J$?_|^Z}%$r7bNj z;c&Q>mDSwb+}pQrqobpl!-0l|hDRTPyD(s6WQ3K4MNLhOEgIO`+Vb-9QdCr&nwqk( zu*l5J{PN|Cjg8IqH9#5!xVyXGg<**T0T&mS(9lqOd;5zkfZ!9*-``Ij479hmYieq4 zqJiQe;Or7$iv%Xt0R#d;83@ot0PIl!VGy911tcdYBbI@qb3h~xh-e3rx`Ej(Kq?7f zVP*c@3{;K*e35`cDqvj-3?hNOV*r{9G|vG3^}zBTkkJdg-3LqxfVY*v>ON3A2{beT z9yLJ2G|<%4^lbsy*w{q8SA^l=O;pQEztZ&jv70`pN-~Gj((M}qJy`iDonMk8KsonS zy~@qoAD=w=^9WM=E$nf^ao_`=R||L_2N1IKUMq}@lv002bZ;slX=A@V5l%^Z!D!xk zxb6B~!X!YaqefFp$BC8QgJEBXcf2qIs-orDcA2)3wr#RTMS&ZODz@7_H|lP2a!kFq z`vVUzwE4zJBds$H9^T89H|`FNxQYDDo-Em4^uH!xymy;>5Q9H|U;cl1(p{wf&SyN_ z>V-zyhm{g{31hwyq@RI3_<39kR@GMSd+iTgP*|;i@V@q+5Z6D=*-j&tHFuUEOb4cW zH%pK~MI|M|p4DhqZGhQb7s=yZaUezJe|=9-DLU)TtyUn>TMUM9z-^cu{!8SC;-t(U_PA*A7%C7{~7U#+6{AI#&S-fj}yn)kv;zj`m)!jS7JETeL%RTQzmf0sivy1q+&YM2=2)}Yn* z@!;)yT>*CnSk?2+V!cWYLgQXyFG7QIM1Q+mo8PMVJH?W-C2NmQEGSda0--HK_`-bX(|CXrfAS@%6};_XH?QcRc3h)BDlzi6r(H;Zdl zL5f_bshb)R^l=WBw<|?h^Jij7<<-{uy)tag>J+wuYSvY-_pH&UQLXb7EhaJ!op53w z@ni2&uF4USf>BfWsEXpbyx*wu^A{u#_$$)dORL6lrJ;(DJcNsDSJ3B-RZ_mXO(!1h zBn7P&51*tRaln@aGx?!%!W3$-+W1X`5Xw0=pz5rMm(}oC>c}T2pA``M)?$kMoTF46 zosxXu(qYYtJvw8E{yLur6*4a+DpqhZy9vn;d1kiJ{nSVRu05bORvmrOky~w|X>IMC z7yFp91E3F*8xG6ldk_qsz_4-9fM5-$*g;srr*|=Ib5f zA%j9_+2?@kw->bzsk&or4*Iv|IqKy^*-O(3c0$Ua3ykIhZtKx6((!w!bG$SZ9BMdr zCnYO1fne@B zd!OtuQnp`p(hkIA`%-dc2Ck#94O#*;QmWuu%CWk}K4&RU`r3093F78YzJHeaIF^7+ zia>2hgec58vx*W`PwJT>CQxSZ68+}%l^j~Edv7|qc~NU9=J)RqYK;Mt_LgP)q1 zgHj_ZxZgs?k@8)3hg86Y}$MP2dq&J#6a5=+ND?T49;W9k*{i(xXxPrI|H)Eh->vA^I+q z#NF4-XuDKF_0`BiEtG?s>RZ?0TBVHkq>F9#RB#0X5<%5W?7D;$u_nYDrWqR_YjKM> zgp+ZDf^5{trNkLxp8Ona{lcp_G{Vz8%DOMq4|?tBaf*HhsY?(go$J)}znR67g}4)4 zJ4^3dSZB-HT~W(AuX9-3OtDrwB3h7=m1~;K;*5xr`6wn0S1th2Nl()srSWyo5>`N} zR#cku6xKL8YgtRi897EqR3vyCrVd&2rHn@vnFL>XN?BQL*tEe@nflin3^-+NohyY&tVYmv-pwGp!Sz`3 zwT!E`(x)?YMGHC9df#=Rm_ajW9k~xtr|CtGf%TR&Jo!U~f*yYPn*$UhBAvzPo~uqv 
zXYC7Hj|D_BEj$aN7XQc2hr0giVNQS7f#N`IsWj6XR+Z1T&e5pPKW{>BJ|X8>uup6l z7%WTSBrbtC*HDu_>gQvpg*uj1R&IPGt?+bdPn#{07vCS3=xC9UQzMXs7PngUM$G1e zeB~QnCybQc5L)F&o3z`R`5?9H$b^XeN8>NEwQSIQkv3DNoAPBFuyM%1*`r%B87wFJ z_l@w&vyN3W$#cJ>3c0sWUyRl@*s4v2KM&=8(vo6GQh@O6A&Z3toHuXICaYOTyD#v#o3&O`TDH2jR5}K)V z%Wb(E%w0>Y`qCQCy?bkId5#%ZE&7Cne0s-79)OzudOuBMb5qdw(wN zGoH@|y6G3A59Y2dY3@T5t*lDiNKTG7rMNJOBqw(iZ^S!6qk+i)D=IteBFrt^`lI)lLnS3@|^;hc8r|^m*PuqA7 z0R{N(N1kVG*zdJPShMQg$1WZA{Om|xH!Kq)a zD`g70i(7-=gZ{Y7bdu%x?igD0RFK1npHgGlU~!y+s$!eK@>9Y9AH=A<~n_T()pd#@MK=9$bSEyWew9$wcM)rM08E?JEGgJJW^{^LO0HgkRh$M z7bIsJv`4BtA4X$efKNw%h$WPdpp&EbI@mhY@GG8VVhpegKXb`2S_lT4y_6tCg4yDP zhHs9e$XYI+kV^@ZFI|#{`RFJZynJ-fybZrn;S*?i{qyN6ZmDus^Q$z64GMnFyYcH; zUWs_;(4L#vvv1zv#M52erQ_;2-$6mN-!3kR?XyJ~aSd$$Fu5=U!(cP0FjZQ2#r^24 zLo~KxGctp8zw#N~qq*y|W!S*-A|Yx{Oi_5y0b4p>fB8z>lfJ5VG#OVei|39|Mdh{ z;sugJ2s;JtF+Y)9-f(g;;r%UkKODyM{S~Z?@hPrAaOLKC1BDxpMe+yS4FlG!yQ3$> zq+eMnUv$*j zp|SL|ndnL-@L?N9^-}^1a)46xd<~anOmu+DN<>#WKKo3^bq~E3!NQ2SyOH?i_C7b# znGD#jN?466LU9LkTZyi0(5+yI0(!3CUag1@?a&n>6?r1Cy4Z*FLj+9WoN!n*|Ic5w z8Ka#R!&UIBz{mlT#Pm&4Sin6X=!^{gA_8Wk!2>y&jMapR-9zt_02e^Or=!?(>(-9o zUSrIFJnt#>`Pe>*-Mrg9G??rxVCpoJxWgSiSN#)c>49CYxi(15@eng{amm?_sMdQY zR=1&POU{?zcG}xlL0*{Njvrz2mREd2jfN@RmJ3i79Dg+R4IGWyR{iFtcxZ|)x}wud zTK9Fz+;#DDx!N<=zr>;humdSY=$x3C+f8o9^51Pv0Y9j0+_e)Oj1F~lHNp+=-M3x* zhTcOxSM-|}L{s#hCF+u2y;_2GcPL`upOT1XU1QLh{f5k`==r>ZlSxQe>7L;D1?xP_ z6xcsroDk_PtWSg*FlIb!nX6kx^|E7KB`05WASq0*{KmPSO}|F@(pUiV0{USisYAyBH|{F`%l>;{QPE$H44{d zLzq>DMm&WRv|*9;QG|*66378^K<1YRiUBs#j<`GTGHwp0+49j*;4z4w{M-3Ygg0f9n3=Q@4b308c0o_gEmn z_nS_St`&zeQ((8@`#qRqJTwe_v4z@FLv~D!4sQusiXyU8F@mVY_;QHYPYM>)je}J5 zRu#3xH^G3}1z1Ub0CIx9-t#7nafdplRNapmd2lW3cY!p9bT2aIC$%d-cT~SdT`mU_ z(TZI6?71$dC8ERNf`Z3!3xt^~;ixUs^q`s#L2ik`m{DDeq8^-+FA;C2SjGN&=`#PM$kZa+{qSt}PRc0~KpLcess4n{*PdQpGvXDq)QVen8@&>cF6R1mD#{MQTl9H+n~Scs|l7DcYCmBssX{ zj1D*SVd3PtdraTM;L|yjgiL}!eFJfwiUzX7@6|SS^n{R#h+xP<&Zac*is~G^>!WJ! 
zNhENjqSpW2IY&B2Xm0!}bAW%y^O^s%FMuaX0bceQpVuRp*EQt)&b21TvV9s8=TxnTnj12zg` z9+Em`ha)@FJc%V<#YmCQ``3PbfcemUj*(^&_Q7x&Dzi~YJS7W0i-O@V$;Dc=7T&_v zJCxpA1r2$Qpg$L+$JZY85V~PL*~FAXg4Vj2B)}-lgo}BN!8xXV(w+v`oh7qAm8+iJ zX}m}dF2&ixA{^%6!}FbfF6uosX@~NJ1@971K=QQ{^P)lQWp#4I;p^Vo>s1rrTlbb3 zXNVn7NI8p>#4`58Yo`M-L!dHor{{9-xEs}Hn*q9Ri*qb#k8{SXG`3AFbl8HK^LJfr z+t_wTUo-)W?wvg2HrNpYAfh;M+TMs_T0~NpijC z;?j;ht$!QmB9c=YzD!$~tk;}b*sEV4Se+*&om@zB6xp*kL$O0j);*TVSDs?TBiixl z69j2NRTNt{=1RD+zw)s+qB+g^VoDnl;(K&zuv(Dztl`WWg`15^=@S|RqdBI0FgM5X z;n(|GbQR9n6@?WtNC0$Jxn!x`Mm2zGnNAU_I&r=xBU|Wu8Q(8R*1L8Ud0ySyQ9_(f zT`bg}nav|I=3|G}q&GCL_z@|(hiL9I8pJlF;0NzT)`{zsBZ5%Mfu=*9k$R?AL_Ynq z5)bkbeyTJ3@cD0JQgx<*d~U-F9oFDO9SYaDD+%OPrsH?~7LNd8$Cf=CQzV<+-avb9eB-H1ZWa z_SUxw79xN*iP8C0j8%nFMTZ~|lbE{-ET=;p%GJm&GM@gbX(^>@RhG{MAQS-BkUJ`fZ#KI+R39CFzim>@I9baq-K9QJs= z;u0+s9z#{vC3T97?V>*{k`{Fctybq9!PP{cKDABT0QpZyvm7_#?70u8q)=oLufDSM zlXmoT{071;Zd-*A@nIP^kur|R(qNBa=@F#3tBp5)l4&c>*kkS^8@>T<;^g9 zKpsq(3kFdxhnHTTwSsyj_}evTJ7fen=44H7oSn<+b+{NmX($pNyJJ(6kEP9QzNMbv zhU;w>M(OaQ1?1~a{puYY<=oIX%8A;y5m9uGO*v6_OwJ-sJ>naom-Ooq3^<*uu@WSE zj^Tz8kl9)oXbK_x=2HfrCa#kBe;AhkQC7b!>Obz|C+>}I_RAMH=&tMQ;PB{&kZq+1 z2NcX24|Ue2t-KKPmvzhOg`*CyWic&1i7~oL_h=8E2UM}Ju@1MmsZNZKXV8$L!(z6! 
zrpMDGgQu%8f#1HK-a3r+dI@Scit|(+W`Mq6DR`3>hc-icSfkeM9UZyY92}m{C%4um zoi5kHuU%3gp_r^AlyUT#;{DVtj@GQ=)Qv1#FjuPp6wc;D8o)PCg;+TOCNB`ISQu_W z$Ar$HF@H2&H-jF;{UMFHKK4jptccJ(iQ1IzJ;qEkN=NSNj~0-v_L}ArWd+l%UaaIj zN1Hs}X(VGF+(WeVlGQ7h%FD9lPg;rvd>ExSKB#hkZyO;_uWaoMwLvGVKF`-s_wDX2 z2w_)LNKsTMVl;O8*n`3i|HGaR_NF9CFp50*NX0FSiuYAJ)aVemifrCbo_C_ojyX({`LRv9wAGj08i~`aFx^+d0DL^!g>NaGX z@uFwiOCbTpOwE5%`37%*ViSFS4Qv*iXM5&c2rMlNFJ+|IdgSy@1$uK=TkoL`KQMZ7 z#`TK8CbVj4hocl{3&eeAcZ7WYY>izj(5sbVOsa8>7dMX`v*+d%E#h?7PH=b z!8KN%9ZfJHB8Q(ql1Rc+ z2fTr*cQf=ZfAzD5ew(ZkOaTxV{E8Mx&%fnqO-A(`H#GEzcQactE{`(0%u zKX@=aG{I;*`r9p+ksoyh^Xu5~LW4;x0eF+{SdDJZiugMV)yFGP(MfB1h5Um?m!4ih zQ74Czbz(PNy$DrJla+R3dl>9}MIQL%7o22!YKy6tLBd(RcSXG$Hq$phqS&|CKO3Ci zTG7ZH(b*_L4D&PCD`Re0N0R zIFDYN9PdOT+k0AJ*c_|X4NCO>7V;ig}XwD;^Jb*=7|f~wT}whcgumoVQv{`0bN3#rN*q04!;HV zf}zYz>=Q1f76Z5N$jyS_2gp@2bKL0R5yK%RYczqTx509N;Dx1n*xB1x!$Us+^I%Yl zSnAfu*HdmfEtlBq9?Z^oVK3hd4S_h85{nXY0mxcwqf(dmT1sHh<3_9fAZ5RI`>M{E z84>;O8;-Hy``F~9pTS+f5Ox@B$V)Nqp~W~7t$Y#$exLj)cpg{=``=c%t}GGz5^(vM zHp9lve+f+`TJaKdjz3U&r_r&p&fck9vd5&iaGQ<=%?eC>7+JzLHh1!fcL+#uoA|c% zv;RD##-E4DWJn>bEI}gJkp^|CQ^3Af64T_Exs-Fd{JNtz3_c=_X~Vgz6K4Klb5ga^ zMLk+LN2FzT&&*@);_CFuxXQWhs;Ltmc{z1-^MA&jdF?`2g5j6KC=n3U8EvL z4!rB6{C@tJ|GYK_D1@_Mjv3N2ZI60d1Kg>COD+i1R)BJc%lMO1D1!yTD zDppUci|RdCV9(Hol=ksOu`IcnafiVEPm#S;MedjvBbSkZ{)MxJ`yrLUvOZCQl~J^!mO-41IeV`qL*I8Sy8v+dl}0XmceV-UQoA zRU+g7HNmCf4ykXzI#7%Ei^U)TN>tnw-tthO_sNV~PXCDz;qE3Mx|bm`%o54+i_r}f0+0Tfc0V`DPH}ib-e$b z{rJTrDTL!V4>iEJi?JB%L-d;?>W_p8$@gU3khr8dhLKDE7Ai(pJ~xY*!z$rBCer`b zWb&Ns9uL!(4Mvnie2dcGj8Qb2yO4e*@lz%-QGmkvR$?YGHumeq)`qdP(EB@HH-9V4 z)YBmemq4duyU(r$RFO|4t~bc4b~0C;iLkh6o%?SM;C%mz0Q-NH_Emuac)3_zW;g2n z*XUu!*RStg$d+XAzBVvKPje46-|m?Bye9UZKS$Y%>JPk8`r5mA??BlQya4iTMFqQT zh&2y=pkO}*QQn&qy()e&zH2iByDIxEafUCf+;%6GE#fj5H6*BI7I(yEs_ z!yvU@=CEwUOljM#65_C*3>df7rPjv)hb!%mCZh+?xRxcyO!R)3dcVhx;j82 zOcvJO6l)JEsj7~I&79Kf7^0NvqOX6t>p3R55s$?WG%p)4=y;8Ztuu=<}>|@ExEmB@TDeh*~R4Da9%vt`6nn&!luhQloj?HUp4JQgaXb0}{5!TI$618{RTg&@=M^EYp2glEkP3dO_ 
z)}X3XGudyK?p5@L=ywb|n}lrM9b11sQp1*W^cnP^&CR02x^*hb|FE+8yFv+C(JH!O z2({&*{*#>8-->Ao^*J)LsVZ7dnY4JM*LGN$ zTlM9|_#wHSS_@?fL{=@maIhKrx?(Q+wa$BIrms!O8Odz2`l5ad2yK!?XxVTqw{6Jh z8}Q1QNGjV|T@U}mt=36XC*rByD7gJ(nKn(H&bZ#0c`-cLiY${*xev+Z%Lv9-FQznZ zI(Cx|6E@I*6FfdCQQ?RC1+yIDSfW&e6o%9<90u^kVOn=0ZwFChg^j6`_U*ybjuPgE zKE1eu)ldEh?gxi_!`kr=ECq(`sz#A@ddgRPa`1}p%s>142au)k3xDwpI}clEChn~4 zNWqOm7Z-U%zYwdc|%btswQUy7!rgvQ3d9l*gB#pV@mtd_r+Dza@>k{r~sC4X? z%b1$7IvwkB;gD>w%lyMa?uh}M!sAlxHG6$2z6jl$=<+U#HZ=LlV_Wa%ZBek1*8}}C zNFN1Fma*)^{P-3vS06b>t0s`@k8*Dv*9)Vzr$gJpH`PaF67aH8TS(1(@rjLnObT1ThTI-g@XN)S~Dy2ltp5CQth~nv*L%p^^e! zrHz|2_fVM_H|J5ycs#>6B2dZ1rs#3rs;w86ir?JQi1SvrYi97DeW zU(UamRM0hf~{cC*GgWWWJ^4}5+(4#E66!88n?Wg7S{ME!*Dji%smRZ7^(h&ZgGV2kApmi{GN>sJc7E-dhDsA3OQ=+(9+)?i@~1&cTgTr>b7~7DMWFCL)C0 zz~WQ<{tq|fwHfM)Hc~8HoYg06EfQWBPxL$g)`ap*jEiN{VBBgrtt1XAsxqmW*KB|M zEvaEvA;gr~DZWB>CN*LG8JLpTP2CuhmgTDDMD1M?NZ-k_x#_7ACxuCZP8PBmFtU?e z4osnDW|~fROU84m(i&DF+|4p{0ihwFIpQ%BZD?!Z$yQZRO1-Q#`rW0d99Vq(&o`qM zQ%4(fDyTJVO|EkZ#D~}K>M`v38G$(2mzdTIecF9R%_fL*i=(3*=U?6ZEB&+1H_sgLq$Qqhfe;cktx1SVCs~CSz{|oONMHgYuwXVfBGQOd$g=SDL zHBxp4`3?dL_lpn_7^3yZy}Q>(isTv(vxFyuJWp&+-;a!$wiFa|Rw>sQz5{8C_&smn z>bmwrBC(B=$QJP_$~iVsT-;{kpI6&Ayrmq!8=wKls3df8ar++6mLlJRn-z|OZZk+= zHh5^U`eerk%W|)+gnm5LhOj_qmZij9Wgos0{A3eNv(~l=>)Q*?20_-}o<>X}^=gQ( zU+zom*Ugp-@x8f`RTC?5$PvD4T`aCv5j_q9f~4ZK502j&UzYReNNP_Oy5f_8eqN@f zTi#@leRO!APQSo*m9(8x!SL{@HX~eRw8>0q^t*sIEfIt1Bd5+fuf}+Ykhtj9`9>W361zqaK*;pLd>B3Ek0v@75aHL_4f)bxAb+v{1bif6O}#C^amuifvBWLDuh$A$zs1LuNsZk3GRqgGvbuOVDej zYwu%Ye_x_k@LPTt?{)3eM0bfURyv-53skn$uI{|}p=GdTXZTTWayy>&0YJ)t%lrR>a4v~-08nn9TF8%WB znK^?Bv5mCd=_>){5dpiDK>b+hyy`aHl(`u3&>cMSSGExCJd7cwgCB{tGNs z6vZG@ajHRZ+oEvi)+4;`m`lrfS!+UQnA}RZ%V&EQU4QM`{6eego;uLL?QOn4;fK|h zoE$GgFR#aHlXPCZorQ{P^Gtdf_4ri0tT9%FOW|yZ-=fegY^(_ z5w7Edh)>P;6WwETu*pR_Ozs7+0T)B`11+0`ao3zhj>>D{i<~aMBk@)*XL3&=T}oPg zA4^LgpVoS^rfo{6%ivwl$^n?J3Mf7{Zdm60vk z#%#MPDN+tkMXDHOhiN5QMCRUQKkztR*AXl(t}~1S-s#t)^yf%gH{7Ou2Y(w&9~=KP 
z+vrV~^1|?)*Dv;O&Y-%>jW-%!NV|Qqc1_)wBXQEDfN~y>$6x1&X-l1c@7QMMDDL4- zhb%phES#6eO&4xP#r(@hy9IBd=da)kt2>)M-W%d$d_6?f-!D{WP{e0i57c%`PF6b7 z`3UjV_nxkKKV{goqg?u69Y9k(5wKs$87IG>2&b$T6n%$|MUc07zGJP;9`(5nmoG4&cO z9J(!PF+KIcWhh~lOLwuAM@GMqY{CKJT#VQjy~s(he(~j;{h~4ZbGXF}OXjd5`1HBj z5Pnmi3_C&Z?X7sXX)(QRdgL6!N}9O3ae!7H{!z1}M~6#{hA8U-wV830A42358~RNH!^|@7>vn? z#gnUP;%mZ)LUni(Elk25L$0NCDa5K80;a z4q~)RmB=tg1i3ft(j@JWak)Bt*E6t;jUDg?-e2c)oE!T3X zU>q(#z`E}dlI=ezsHcFjhN33bv(MlK|<@1$@KGG>E$V7Zwm&8 z^CD-u=1<(!Qb$qdI@dXz)z26V)VUH_6vIE_7qTF<*<~V_)^~9K40kFJE_3iRBF@v=GNmg z9Pw8Cc-enmB_AQ{#-s$-M2=6%JipiE0Jlav+yuKU*FnZj^}c{|X-wImnn6y29_9@5 za>1ZGl66U0EaDco+A~2PtJ4ks2N$W9=6!v2P=b!*X}th9~vA% z$B$Pl{xt@t-}j>>n9k`)*SynMHoj>~sjpv0zKOt>8KvKUo(Z1}8Riupv&Gp8ZS%_= zIkcb!rLvpSP8x^}-iBG}NKJJ2*`(>~de`*Hx?3Fz>NgWcN{#ph&vwXdqM44Zr$|48 zf&`QeCy?TM)ss2>0u@Tde&&@subzG*4ZgC zM6$~9&b&shRb4&B4dUq9zFtvHaq+__c=&UXRd@xlgQQ`$6xZ74mp?j`a8vol#;ML1 zkn+8dKh9WI;~_eclSmGFAtevrOX8d zzfjYN-mV_IW42_E=qqV~5y7Zflz+^R4P7HA8KqpOY<F}6?=5Hc(S%}aknaM*zn@5=RQKoRk)!7!5Va43+loj@tvAcSz@OyF39MzeaGNy zyn>NXISieXp}G)7htx$jXtRT>pBqYRRgL|Q06K@1Yh>vHZO{ql!<&gm@@54HL;n>2 zzg%l)D{OpRbpFQw7#;d!Z~eCXjYtEtOa|XrrxKEyyer#ef{cD<)^YD)eUMc&V>^pk)gXX5tOT>;`K^XeH?yzam}kz zNzyM~jtYew74_NHikn75qb$PVhy`sd=dO^$4$MX??iu>GrXH zp{ssPdI9!4@?3PzA2YIk@RY1)2_3j^b`+5UH80##fZKDZ($O|wZ#=?zs1}@lg*CRN z94ZiyM)>7Ar|m^zQ%ZTC=+o&-^5r|a19#FaOEW_06n1n!msWvakFqJRM-jJcS!q2&FbBp_AdSM~qU#%JcwW9}F;8SkJw3-f?VwT`T~?_~Q(pt8DwPC* z&tvpct(pV-4WT1+PN-!Dqr={C5y?k%!Wunz)Lb;=Vv%ApB|H&nLgw9OXyw{;n&w)q zzdyq-HdAK9JF{@`8l(B?=rDl%kIg;A@v)jg5NUVxVd;>pQyf{qa_(nykw>9tzLfM8 zBQU)^O9R8_8J{a6#hXO=n;vL;`&RK2Yu{W>JGta^!vz$6&(EG>VQ;N_e#IIzXvT;Z z+8Nv)T({4zOKx<%7XNobj$gHyDq!C?TjGqPkwRV7ace5^cyyU-U#T8B_taNU19Oi*@@*`&*l{0LPfTLVRB)cPW*NFBR)Z-|V8N6Kl76pS!sb0|K`n z3%50#FJnDub1a2~_dJE?neF4H=W=bOVDagF-j=e`;Lti*s}T0ehl*DWiP?DTNb!Lg#J)L z3(%Xb;^%7Xt2>u5-=DT5MysApt@Cx-B8(_CFZ+!+rhQ&b=GUITNUYY1O#Gh+?*hc5 zhKpsDr4ic4iN!%ntYG|*B_WhiYuD_vvC15`swVFVg5zY|w>~)mcIt@&<|q9f5j~wq 
zxdF2>$ewKxhz8=G!MkJos>xDT((qPKJl=+-Pcx-MG54C`R!-xjse6W>nj$~7u3x+# zJ{xo0?&a!)cQIm#xn%As*<6ucxv*Si-saEm>STl?T;dEVQ3%6RR;VBSejIJ}_gg~M z0{wMGv?1TEbA)=ZOxxH$B#6I3cRfy7!RvKsMvioynN%yi{|CUl-RxfibFocmjbJ)$ zJYz@UG4{4G`l`6Ege+XDQmqr8 zzVCxBuo#<9Jh(V}SjP(9rkM38F|j95m7oPz0%I~cAM0?)J02vWsa@wH7^`FY50?96 z_h)+6n(s-ya4CZVJP@RKd-n%vhZbBbr4!tKjr)Stp_F~VfKR{hZ=O3&=ubllcNTGx zG0zhnqvr1bjkxVX5SPh+=DBk?{)6WpGi|Cslso}5Z-<9S1bewSOKRsAqtLda^e6f0 zKerE}8zlW-9XNd&R!MLUB;w8(rrgpne!YHJP3x0o%x3(M*Vh2fsXVdH2^B515lt0! z)|q&|ArH2dayU6z6dw{P(C(kxp%@ygHT&{su&-q8xn}TwyF=kPlA)QX0K)FPRZdbZ z-{+t&4R^x5M6_q#Ya$Qv_-S|xT01bwNrx&?zf84zyoOz0w?ghOv!|PIS$Tfv@*Q*h zLGjV`G1M*2`iHy*eBmQ+>fo>qG=-D7pBS04+O%HBXz?9CbA}!Mo8mRiA{P68xVtKj zhz+&>cb@yg0I_VL(-Jc*Lu4o^$Kvx=9{O-wS`mN71#*RvXa`MADbeI@Y%4sx>9DFJ zANuvsdkVh3tWs%~2T2&s)zTq{Qcu_Z8E?2X!KDZ;F*v$|WBL*Ilv5KS?NAy=;l}`U zGRS}{3CUrQm^t^@h?muH`HD5RI;Zw5s$VlN;A_)ABh1#Pga#6q_5yx7~D?c$WWX}gh9t|4(4{%v&RNfzg`*wR2 z|N3n!<&FcBOK1NpA(2g499^%xLa&ycHe1nKS?RiS&mbRSCgk)TZsZm&Zb?cEu4c*U zDy#KU-QjCT@H<3~BL6t?j)MB--TIm}P_i*nnBFsRZ6%A+gddxqpFYLf|D^N#(}cS)%Z!If*^-D+nzCL3?H!ExUu%p0x^4%G{<|JcAiG)7b> z2VTqg(-CpiN)QmU`6H-Gc|NshOEzU1Ef2=mJ)ZG0eklBJ?6zI8gv>H|^FOoO(zXA_ zZiCJ$n@Js#xBdpVns(-eZzIK(^G>42=|u_!wW_a_!G|Jx_${rwV`i11U3Hu6V|0|F zHH=`Njg)(z@Of@7J}!WG#GT~;Pa7CMLrIdm=0w!t zRzoVsH`<6Drc4{w2j4w`|%YL$VV3*jZ zB&6wkG;{?k8tE?lY;e2$k}8i`$Yq|i0^zp=uLv3Ygkst7$w52cndws6n5{uPzxuO8 z&;*iTNY2ssxP+ds&O)g9Q18s8P1vxL0hEr3P^3e=yoJq;5>2mzC{eAz6q&+c7W8>iz}5 zR6EY(n~>E(GqK_z=KrGYJmZ?$*EVg(jtB$=0Rg24M4D0s5{fkG5SsK30--k%si8{m z5=!U@gx+kSx6nHhTIkX{%)(vHyWg|lnR8~&_{pCiWSK0puK#u4&y)7t0o>+W&|n}( zw0ZsVz0=FdJbEwc5H6kWwb{t{c~i|APG*dAgHe^F4cC2neNU$=WyaH)&O^}pw8GLV zEw|3+!v(t*2`Lr>T=5px&WxVSUtL^;gU=}o^LaREu9ps<91i!pALNi~K#A_=U=9Uu z5e?1@H#u?^%DqLUEv3t|IjIX zjL`r85i`xGz)Ni3yJ<2j>)g4AQW@KAC52zs(0Uq64gUD=6*)}6vW&AaF#tMo-Uo9% zus@9U$=X>*!5u6t>8*4H7`fWXZlo~O~TqHbM;^XxV zjFO0$5y-Q=JFScIv6#6a-Rzg8Q7ObL#;hTxy)O@BTIxKE}qh;Nl`U+#_uWyUWg zIa{&Yd2>x{5e={Ds8J>SR=-I2`}zfk?myQrXw4P!1^J|vT^v}qUr`J1afhk9_s5Ti zmwT^qqLc~-w^P`#Br 
z-5T9>t0)7e=t4=b^#J-tUZmStfAJ8?FY{NxElHoXvErSv53mYt?T#e z)5ykAvf$g!Er*Kecf8r%S8Rd zKQV%M_=Hd4{IRK~yDLY;=+vJYo2=!R#LA1e7q?*`WdL663cz@!dpi9k;)Xa#78kkx2n3a7sS0-$v=jNBfnnLF$R{?{wmets zNuI3^0GUM{<|C;hFt{v_t(vM1Xws>DZF60#RH^r1ORLnPr!8G<{0`k~_nR`@xe*uQUi;lTdKJ5a>&gfd99!Yq7q^DXDn-tiu@7F(~GL(6H zPR+jk4@9mm{X0Y+;K=Ju?3mOoV0vcRR#w$4ny*dgPoO_L?=*>L2d$ zikdLARf-0Z$(w|1<*+l_yJ!#!b{9qcRaKX^0IJ%`KBW)I)2FICxzbx}V4j&&mx6sD z%YTLCZ6yQbp}gZl_#SP03Y2g|q{%SQh<+#FJf(NsjxloV^L>Wza}T9z!`RMp#9|k{7Y_u@E=Ip|}Uyv}Mmk(AuCTwiHH^CC5*K+^(t?6lD^F zwA6MA`+M19Yzk=SSD^lI#1}C!b5JM6M6bWrI}2hv)pjxX3s7$c=rFL5G*1YAWcmxe zu^+T01ihmc&n)%c)k9Td?GzZD8+Iv&55(z~0UFkk(9h6f!ViiX zT{Q!fKH0tMynWpdkmFLhN{*SBA0zLRNbQyL3g_Z9Q~cvBCWMyP_#|vIpdJZdp-9o^ z#oA%Iy7L1{SuYWdb`u}S>hRUP~?TYVx#ZsSW7C9aIm=n9I<`=@EshGeel)E!O6sD+cM*};# ziLX-XU#+H*K`b+_K~Kf3C4dwBQ~y$@3{Y1>GM>PdVI&u0-4TDD;#_j_LVSD!xoUfG zlSxMIg5ER*$B|ZMqvVbeLS(#?XhSMBJ2l8OArU(m-R79jv=QCwI`ZzaY?eTptP$-$d!Wn|^$lBN}B*J=+bw5~Nn$gJGlR}8q1hBQ1 zl=0}9V0cEH9e9=#c{SB6_?^10E)il7pbnZ?k-U>DIZ`RF)G9%nF6`Cx2k&>+kkA2| zLky9>Rz57hAKd`dZF()H8;HNGErO2-=%T2nKgjxqHC>RciuJT;j}9l@UP$`p>supa zg1Y|&a+vevzmPx8pyTB=5gA3P@ek4A)#_I7I)kE<>y6+TUI)iTpvw{LZ=hcUQ(ZW(?V@} z7HHV4riXU4wb$R@`qbrHQm0ivQu3J&zf0bm43)gbFwP4DZM}%{#Q6BUp&)%vnLE_6 zg%~H`#~nG4;_mm|S{e8Hc|LwjlcO8iqKyCzE)kwOT-Lr8c5-McPitEIHrPdiqYvpk z^myZ&mk3NIxhH*x@XdvA;mYRJU^!iks-e-S4=_0iDt)*6Gx}SRL~uk#2kGRgxd5T5 zZSp;(oVm8pb=__=;wybMar5m>2<8MAMRXZ|x?#?aik-Pl>MMD^{J>u;+-}loB!j8(@Kt-jOp5AO=nh*HX&NrQjtJ9S? 
z|6>OGg-na^s|wa7ayZp95|jzB9F^$La|MlD7jPxO_Bf!weg53Kq(|ur9bmM@Gk%Ic zKXn2cHaB>$t3$U&fD^^^07CJR(Z%ii-cT0}&KS}M(Bl)+o)(q|&n&^z9BQ*s+aP0B z5k$&_A07joCOkm}en4xcS~VQR{uh1ik;R}3k0U?*y}li2g#kWl+P5mVY00GidwtvY zkMo1%{C}-)7ysS)!ED_*tkfpfS>XpoUzUAh>rSDId@5dqn)VTxYwyyUfOQ~U(bSG5X z+m9H>OcCUf&vK zIL%GW&meE;qjeb-NU>&^4duW(W%amsl_rWN5{0&o#%l*3i*d1HB(@BvV?s(HO|)da z>^pLAc*`H$&eMxXbf?s2UmIg^1;M_Nra>{l)%@P%?uudC4B9@j$zhNy+oO*@)&48* zmz^&i3@X!+47s)-(3xrJLaP=i7u2v1k)KT{aB$sxiYOMJs3%F*YN~67v63SXjV?x) zbTEE*66e}>99mHLjtgJDW6wUJm&QGVB^qXw&$tUx{6>m8liRIP|BBqEuh1{RnxjcE znV`JQ83e2!-dpujs4z>zzwDzwMv46K37YrR}bK3}G8x;~Phb$mYqI%r7jWHdWFiu|#= z?_4;X10>jm!`utA*fi?^qEDai!Rx?6y@ixQS&%G z)TBlfdErfPAqtX-xgAYgFjm0U5-!&dI@cDf(Vt z;O#=a;^adbgDcWIV3B|r7RW`5aRw8ED7b(Dc|SRb-DtQdQt z0Amuo!11wym9{L~zT;`tGoY6y15x7`c$;|JupVUe68nsv0iQIusqJa;q35}HpGJWw z4-9R5c}LM&>!oxH3h$xDC50Hclw4bp!+`(JxA5`{g6jFj`N^_pbxIX_jc+P`maH0I z{;J3m3NnrbZFBkBLqtZopLkBb%H_<7-m#!_-JlH$?aqU{6J$cCbFQAXQA%w+EK`c> zL=gmiYa-_B9GVw=*ep^Ax~}ARteTYG2MXq{>pL{{K*5{}$)k{+d(?8G6|G59u~c3k zN~t9@3SJ1P;?5L6TR%W$1$Kq-^Js2@R$wI68xLwdPI{mARuw4`H|)#jh5l$TkEkZU zz+Xv9y((uubzzNsVx)KAjUT?Y#&6*egvK%@ruFO37L(cE8UV@ z7}U}+--2@>Xn&VZ(Hj=Yd=0v7Z-V8umOCZEC)|aubT#h@wz(oE06_ezWlp3kKtt)t z&?XYX`|V(8G{F!I>R*KVFp-_s(Iw2Y(@mqz#v3d+wqkXT7)&^7VD~W$hhaqQLe$Xis#Bf(+i;fa9TK4RFyXeq4Q^)m%vC@$s zAzk`T)Qw*ECO3}C(R_g2V*_aByzehY=qQ8*;g8TCms&k)`ULqN4hqKd1f*f&xF2a1 zD0vqX>XUb%l8?EPx8ms%8EMtT1Y(Jf%q(gTQ<55%neB-C+OJ-8Q|_4<|EjMxcte^1)g)NGt2Ay zi_Q$F+RnAjE@Mm1iYuvFvLv91@5@a=c0sY)$((m@uIh9og7{hvKa@)mlfY{=Y*+`s z2K|V%OxP;MT}rM2?zX3S8|ZsP7GA0Om>a6D z>+r5(0xU4EQT7VIw&m{s9uyL(lL<}#fFdLu_k089NFFVOM!c!D!x_KHiuGN8BmTLr znL1^&B`9-{{(LN`0Y4mhDgvgjGy(Pp*X`MKuV-F7$gz=EGPNa`^m;fBBSB~@vY*(S za;$|4Eo#3fe(U^<*Xw4%9oM ziMxcBJ7s^83f>*vL2QYej0`-NE6y%s0@XBobt40x77A`g-dr*WUeMw+0BRsY>{O(B zI(R>#x6AAs{*e6HdK~XYTR|ZAhU_LeB2Otp+oJm73e%F>%{(U5{fqKq6=312+U(-W zi(Osk15flR^RLzm3oKK8Rsk3p(;?QL?QJBSE?VW5eD9Kz%kD z`QpF~;R*wq=p2u~SlqhuH1Tx@(a$}P6R)Q*#CD{WuXGJ_5SbZ 
zVUKd{7MtGr>VCw0RzKXp)Fxv%45~m>3%5Z9a)?!W(it=lXt+$BQdfOvrq%z3#%1;NqkzARm3@;B(CC_E; zf#u^V=4Uy{X1_Ge89NNua*CSe-u^WAnb|ZJ^8#x`cIA4R*|Dw3vjixMz;)aJgzH%QD z;ILE|S&L#_(!?D}iVqUWpiG7J4MgEl1b%O~4;R^cOI+0)8!U$SVL@lA_uXs6`N7IH z!)#C>8~V~grR95tP^^z=fwGOvbgblA$r}d%bW7IcXnT8b7&yebv9H_95+m_ zd6oDk+%)qi$|9G4JH~f!dX`&g>n0x_d(acp{c=*v_u&6r< zqGp4Nr9!dFwY`mVTQ8ilrV-|&J3X3FESNG+BX)`$*zGRAe!^$xSIey^GogAM`|-I3 zLMw{^x?s3RV2+I*wq&P@EV@kjB|-VHHy`RaT!4AEIPjvbE`e^)QhMfHS%;g@NDorq zY(h65czpS>%!nfvtIgPy2-SL_7L#lCT|&|e!48UssIgg^XnW2qd_f=1LxcPg&^(`q z@EY1y@NS2tnSToQ!&XN0E1y7N)9{^`nXtKO zm;u_>#$d8a@Y`Ry$adGiTs^s%@^EA+`!HahX0T<292u+m=y4xYS4!leP1_9a4%S-w zLXHIT^ll(uJZ2iVIe_ZBkgF{X203khJdT-7ds>Fg7sNBqEm8qz0=hy_`65w@@~?Z?K=Y7S zl4GIs=sEJECZd+GR-V*A_$4B=Jxek>l;`Hnd%m%ZyJIc5c-w6P=Hdp=II0Cik5UQ< zOuirNQDSEMjw7YI&NhrAB2oR0_P6@-@-!+?buM!Ygv~%g?{fHhC3;WBbT-Wd2&D|F zqaIP_f_v&nHYBtAO5cHmS<^8@g%~B>bg0CQs-(jhi>VTrvMirv*`!#?{a6sgIIJDZ zB(q=V6%3+`mo03Nx0^5)ASV@*YD^8pGL=nMO6c5&#`dt$3Bp*jjg}#zXC)a2izH)K zU6B1O*{_n!_IEgKEO-Bptc&qQn9PT+o!LJX7i{!v(-qAl2ojo-dIBj2l4}p>S@O}H zOPwi+Br3kvzms^L=inaHvk&kJA#!K@OSv`3oh>aR17HFa~`Fq`b ze&@shjc>NJ4mU3+Qj!YFJWf?1=>Q+I$-=no{4^@DJveA%-*(=86S54DN})>xM~Zgs zmmw34nSL11+i4#}QrSIcKcFKkNUT>ik06vjqD826Q4$7KA2&_}bedaLT%l1kiwLvH zc0aNM*6Q|#+N5`X%|$adz1 z_qsJf5wN~rEgPWN_So*e#b;T9r)Q$y^g$#symRUGdIddGvH>KypV=@SYN7g?eYP1wm@G5H z2;1qyW&9$a4KT=;`y?io04@neR}Mj@j>kj%sd~bCkaVT-l~)?qC|){TK=*iBLF9EH zSRz-W3|T~@644VD@Px_q!H|G~n=hze|Ko!kOq1oco7MR)jsg^)cxqBRjI%)t3z$*= zy^%3t=YSDNTJ8qJp}mT=!dT(x0iWZWeangry19X0J$ z=gz)-o6LN@oUBMy`^*n>w50YeZ_2SCLMMi@H{MRSY*x%Ho_%YI{`D}p8Z2oUM);?nAY^mMwmfrU6evUpjGCh^`gLl!L$&HIc<@XL!HY@FtW1(m z-~q2vnM}0w0|#aUzKAvflQiG5+o_lWqoEE4e6Udl=#*TI_lRvuL{K(kx41`N-&;%S zUq3PwrRMpbw^*O_dzW$c5z7RCC8v`@pjmnfEge+pr}E%LA=)&F*y54@sm%<-<$v=? 
zf%HtZQPse%?H06s2KkxQPuJZWvr@fy_Ykwe)Ajl0wq#MZ1QhNG>vou}*IuNe3`Yb@ zQ+)5%ZzsQ7IFR#sh=3zxuD8=OLGGYELY5?FaKC;@w9fT^d!&FvR}1OO2z|fjpk_qB zq`N0er9RP(^}MPjTe>%geGz589a;>QH}@R7L@8m+nbyFx7)4j%(*n!KDZtm{%ak?)6m4anX!joVXM~YZFYcw_YPdm8LvGP7A(;O1#+94&DRHFtOjQUR?|7Wh zz~F{F#0R5ty|m{?PJ}cNban9O94}MZz@QW?EFai7G}Cwb{SsvjsIA{p{F`%kfByyW zXM{d>QXS)lHKx}J}6ru8Ab(;qU=XG69fPGWO9U{ z2j0{zW-xiQfQK`F$HyXYVayR6^gdl?W~bx{Rp(WR99E4Xe{X4|?#lVtw0huNqML`Y zIp}^gut}3L2!yELdJh`b%`J7!dj%7LPvDO4vA6N4dO_JV=!9+N#f5z`?H;81@(0)T zR565$)0V;-ZPK|YvSW*?qGpumDb8@p#B@DTh%5d}z=l#6Z_x|S1$Fo{c|E%4rSZ*0 zrD z0Q1+{?pt){7NG0R7!57|qM2At;``W8x?8d7>4W6&gjb2R;$;yDt1sMI4@ke6x0*G6 z?(Mc}1Byiqc3)t-I9TY%FJb!WDYk&=f7>-D1aFd7&>}c@t~)@5H`D5u|F5dtKknwx z0?rfvpUDl8!yC&3NNcmtLASWdc-ed-X#q*B2_5)dMH^BoWLOEri%Q?!Ik)$?5r?C=0N7Pxp#jG2Ze%kuURp*U`|Y z=k=P)qreHGQ|uk?ur>-<=Q0$UI0u>)>m4&lda_USv_lsoWgoj&zT0PI%VnJ4p(2Fz zDeUzHO{eQ12MGqo2=)1`%A*%c6cMBM_${o~O!6O8Jm2f7M|QEF492A05|9nDaIf)} z#Q!vZ&u365=EuQPfH=PFxBd?AH+C3rihY(57I533bb_s5&i68MNi?278)XmHSyRUN zl@(wZP?*cjqTmEuU|`cj7rSO*wz=n#(!PcBM0OlrC`}9u-RRLnihSFXgmWysa$wrt zBCM}E498b#A~gATh-RJOTHL|cXysLYGzubDNEH!%{HG1xmj@TL5K%S4L&XiTuFL~y z5UEBsbS@95v#TjgDXf<^vm0lS4eoaHOl3xJ&J;|oCT96>N6MCT;rYRW~IeQfs@@@F6~ez67tKy1a;TzJeHd&Jskq=VH{dc9Ulc`Wa1^O?2HxY7u5c zioJ%U)S@_^Nm((zn6NHaD-(IyV~afF+{u3UJ@4CNRf*oW+-0OK@9;auhJ)fZ?=&3e zs-|Hol#v$g#4DsQ%TKEl%qY-yEr!Ze!7QTj_k)*0<^xl--s{hdwUDZLIZ#(HBF}78 zS6C)&X5Z_TJo#CLTU^K>)e@HUI$!Ql^nK!-*vnY%WeW?6MZL7V+z;(aoQ@=_=i+Zu z8>}-n)7g8`X(3RG518o}53hi$<@`aBXnb^6XPkol`N|WaGK3ufqIAlQsmx@FLpEb{ zMawad%9;o-NOj4aWn zP&K6ep2bEQIeW6B`FibGCB6>_p$+iN_rd!qqr?zWu%vKIEt6}`7Vp9{CULI$p;HYg zwwCPVFKh`{bV`K*i=rDegjOu!6h2KZ z4eT#6==J9rfS!nE5BE$@_#)htLkh82V!Q4#it+N_ysI*RsSqeqd_I)!o*}u1^%kFoPQ%%u5pFyN6i}m@M9f*563_^-t~VCFvx(?0eF>wLZxk zwRlw6gDgu83kYQ76D{glrv&&3Mf)09FVfXmuzc+o@1=J9gZHR@@g9RQXR)ssL``n& zX6JjexFX9sfZcM?K2rW5T6b1sdChP4Nrg70CXT&*2}UK&A_!d4geM;;^i~m$PuPac zWHdeg^-jkQ;xc5tDBTbE9qOq(k}@!ffP!vY*fs0Lt=i5dYlVOaIkPtV+UyP7j;dnn#$*2_0YGtpJ5x@L6;`^G|@l!^sRUe*RaDb=T+ 
zU+Bg)T*AV<+<)ZiZq<(h*c`mUct95o%1}Vi+V33{{&p{9_?Xs!{NpsPQk;{GlAn*6 zCL4DUYVgvg=PCAJe3aFejO(Nth7eV~oT^Ijwc>xoD8%}5^ma}Uh%i;5!tB&!TsLQx zkO>Tu4Zw8Dre5${gwS94NQv^#^m;Dqny=Sk!ND=})M#*$#Gy4}^ot=_bsc!Y#mx3^ z$ACgS?~|c`aXyP;-(s>)=evUgG2k)grNuo1xG^U>0vdv$gy!)n6S?ar>L_?^WF4cC z<8IsHa<-bSw&&&-ETTyn@MC54_$ZBN6x6fg*rNm_;Tp(bd2yj%f9ie###~55;&F7r zH9kjhOx~|WApQOCQqR>gg2cj9w00SgW}oKD?)`>1vCD*>C#H&~C~dE!97_4!2X1#{ zsq7SPd^nNoa*_m|Bnv7qE@qd5pWQ12gLiwY0G)qLA|ADBq$iL>^oD(7K1jRkM?LM{ z0Ad8IDqb;)TFkQC9?MggfPJ7%-=FJ~4Ia8}tvap~h?C@!Va~Wj&Vy0NNOQ$b&3~U{AHdgspG z+Wtep0{k`k(A?YTLZqtOH*W@AH@0)<3`widwxne{cPURt?168{D2VFnA^`Txos)d> z0!;|JbT0oxc{b`Y;98aL)((6=yFQW2$5HNbENh(;focv=oUm;~>vJn1l=rx98A z&}psPkh5%(wMT5EnXPij#T{LZpDV&;R7ty!x-FJJBj@a>XQsxA3@(UTtt+r&mo73< zY)g|BY1%PnmEkR~3ReKG_B7b70f95HFXdbKbU(W2aF3v8@5{LCoVQHvk5RMI0_E6f z*jVF`hJ~B3!E9y6mskz#w`9ild(GWfQs^<<_2Vxd!JWB9lQ{-yzTHb4z?@w8<}3mi zTRoORn?C4~fcO_{*u<(Lgs8C&Y3%@IRgDxnJy!LmTE5LF zm_W^xnN`$I;+|gU;sOgj2eZ8Z{U$~=^M`10`0Dat%O8J2(o0PDN4+;`S~7gf+Ajr< zeQZ{R9=Wa#;08iHAp4Y*YPSsDH7k);+J>3B`S#Incb?k3{^G!=Mlrie=(xO=nG%%2 z;~*1RW!sflBkI`{Nlx3Dc?0ZS&-6h`JQg%vH&bx5GK0O_dL7JM@(R(eMhJdagbVN| z=myh%*f|u8$78ky(X!5FGdEo)hohXZq0WdGo&e2bxyd&xXbuwXWw;86)6%(ul|KXK4wn-MH}Q`6DnaZ|7l! 
z$qe<4=@N~*ZWLLH_0Kg%+j_H=){+k8hN~xll93ai1bp#e}58cMrQj6F_ut zEXegH->dmuqt{Nhq4*<^NOb4M@#7vlw-1D*t_v|MwX^8GV9CkCFh(Gl&7bE>&Rg+W z+`pqzJ*sR2bLf&Tts0G!hFv-u@2W0q-j&V9*=VF;LZb4}Lq$)KvTn+gu6)cJrGBd%KwXjd?jVnD72Mp7aauJ!dir1Fs+IN_{_cXAAGjiyrmi> zcC?cR^94;R1{QM2qtzsHzf`;BCBgda3Ee)FRVr7AC3FCbFFa^q8QH+vHn-jxb@_hXI%Kn0SbF3@N}0KSU1ItNz+Hx+jJwB zwp7g5v$zs#LRic>B<7#Yjv7XS>NfVtIL+D44NjsQ6+$J__20MdDb4B=ABMQiM2~MM zjzs&dJw&U&Y8wiC52_-NHkx1OGT1MPV`;cEXviUdk+^q3Z$R0n=Ca^RGT4G%}K+@p8+CcXe`Nr!_V*o+9gL9cFqeTHYm2md?`vy!`~I0xci zC}yVWVgu)P3)=+gFdxhE$py6(nU9_k-q`V4PX?6EB+i!t4AwD}VTf)Tz6^}ZlvFm~ z>tdUDyf8{1K$ zCCXf^WqUD_S@t&aCb6J?BS2`EBTjG6KhDiZ*tqCq+I^Bf+@yUPG$AkV$FEf= z7VF?oe+N|IAfkTRMol^zG|k7Y0f?>T=;A1D5sAsmoxN%UI7&g0b>1Sb#5@`xpM5YP zltqt!lX)`$_xuA)%M$tXWVKO@kf{c(UeJ1^dFfA3L<4K+%2Ej-zd$h!b3> zmkhBRL_(laRKZm6_0;{b#gorpeXM{za6Yx&sWze63uFG)Yn9@5g;O6u+Nh#z3^0>t zc2d!DBt2r;>DRn4CeJ=0^gWjN^b10>GSDX9OK~^@ZMtO^!ex>NVz4?`y5qR^yY+*T zP{#JI^=4tlr`!3f$iY;`GARG}c~hX$*phYeKi1LXU8gxmumZrfCm)3!9AS2C~m}~CboxbuvM9OZ2W~( zpr5)@+YI}RLAB}T>7=A8!M}x^9c=Ym@~*bQ?Dp#gZQtq30pkX*QX1FtKg<4^1l06$ zSHeH;ePXoY&v>aRAn<)0eOkI&q`d1N1wzojk4_M3;r_ufdJa9k^=XS8)skS0|524X zSO+)d#XhIYp*b20-38VNchsnmmukRS(!CjT-1@Wck8WhpkJ95G{p+QH_8NpkZKQgq z?s4h;k!EBgE)w6GFC|enf34Nw!j`NJWlx}+eIvS7se`( zUZxJ*zS$Wi+#*ed7(4Dn>=P>ZS(Csbg(GZoexa~(b=M@Hv{#Pc`SVlRHPsw`og3O* zNf9g^|I{gPJYFx}L%D?t2G9eGATLXKq2~D?{ZHbKaQBzEbC!z`euRr;k1)6-bx8|$ zR4$=ZI8XPqC^6OJ2TR}~=f;w??m^bBC2TC4?6f5cBIxQC7YQ7 ztD{uiURN5Xmj58ziAdP_VM!2<3H=GT(RYfQp-9c%>9MDF{zrqHrj_=M-zDCc7p|auK_Kxz zs`n~^A}&8K2npp!S35v#C;7X;LwZe6-4#7@h`M6s0 zGa}}kV_Uac%NZ?MH7cEH_SK_3r81aDQB8aZv^*BEM0J!aYO9*tQW%&NtQ-eNPxBDy zLOy@XWNaUbX@Mo(RKVZz5(JDNuKNPOusJ^ZOJYTQijTYO`g(V_Y;|G}gTDH-q(0>- zkjJw>2_}Q^&P;Sc5S_$?3+BjgL-`fbMufH(eJ9$GPM;tv)CcmW>n#gc=vc~38-XFc zZ4W4wNQKMMNp<@cvQPgBB(@sEo@BaN|9ROMnf~&3wO2*@d#<0LEI;16trv8&eEtDG z4Vyuj!%Nut(J@?i&eraQ@z( ztLPUydMuNa5UT=^0n>>6Mc^7#lmqH%!%{~D>Thl%yK#|EnXmm^IfGk9(=)iu0J(W_ z!W)X!IuGVlndNd(s|yGS?G|?@G+oN+nbSz{GT*$S)dx^A1FQ;q5&_7=xVgNFO4cC% 
zK)FD!8BS#6DK}WPiENpF6wx$eAtLdKzW;>FB0}5IqWEKVfBX#{Hi}yVJcAyd4G*lY z{j?>McQ7`$&5xwz$L!pq%@ncDt-OxBC5_mfZG4EeR1|cOKeh4w;2YCo)56G_l7+8U3IW?&KP}NMLzU4XDSrg)N1hh> zM!`+k_#y4~Oo?QO`D>?N7)z?)6E$`69yCAG2!I2;UkNtW<}WKwF`^g(Ap^j+0N;b$ zgh(T5d9&9JB}gJ*4`LplM1*X(z7#QO0P!jo_UJdX#__Buj4v%@OXh8eFc!%bPdF^k zSNRcd9I5ulpDe)glebEwIch+O@uG9@eoN4qDe6S1>#i`p^%8nyJcstEOpF&n-)nqT z+-U&Q_leRI6GfP_NSI?)Rn<*+wPo;WH&@eh~rZT($5a4PuWBca=cdpY!X~5d| z<<0*g9k)aFQ-i$Er~kY=o5HUlz{$q`wddqO&-GE)sgG;V*@k4Ujr)g&y|_yT4zy(L zagv`{0}AzYHAi0&-q2QwHsnbtPXBbF**OE0quR(LU^A}}Zen~HT)n6dJLYXk zJQrn6T(-?+FrS=ODQv5CX62?rY0vNM+J#w4g^;cMLf5*5e?r$;>cQUN**5*cBVf{v zmO@s>Bwa)4mNj4aWmvFnN6W*C4++<;{A2o&c_~)+95Rf^QZ0$?J;Xy|<@n2-b*#1e zVCYm+Gh13Bi12{OmX{X7oXU8Dq)z!D5uQY%WET8e*3FOmE3VGJJHu6IbO?a1hUxbY z^G{mtTVQ@A$Y+subrYZ^)k8qq+exL;bKr&HpWWpbsyLjJoG30r2=a-^#u;u0xO@=L zrHMVQ1ocyY>eQxt9MYDm7U?qtZo7^lw_S1Pf@%v)mubVN)Dy zw;yg-l|pBFmNz%N1AcrIh}YG8cI$%=L~65&eW~-+L)NHQcT~M6`xSGb^MSVUg)@;V zdzX^yo-&rwPAavV^G#=6cc7mHC+k~DEvJ$ZoUpX``;k& zWf!b7ejv@@Krj1M{l_>~(`|&jGdtx`>Xiv~kZ!0Jtg4O5P-`9wN%zTLD@{ymu% zD^d$-L?q9#g8)goLXiYDgV}l(eyU07=+;3XY)r7THZl*3(`$tDB>bwPo=ufa*8Vb< zjllR=vi{IxKwHMouoaR(Y8C-is6&)audgQ)#PH)%KE{gz@?@M`)wf9zSb%2{t9evW z{tjOPF;3HkC;ThY@)to))EVekNz)7MJeJwSWtzs;u8sJM>aUJ5G5u86Kli*?rGV2b z)YaXuOnvkf7v0x7S!~n>yAbQ5%`dEUm+Q;|N)@_IF8U5$l4ixGNTsArsZQKn@z!H8 z{}IK=^Hrfgg)=bU!`q^?y_EWxB&-P{%hWOHc*z7+KTR0dfss~{T{ET-18UUyeC4y` z7LkX;k)v)CgZ*JC(n+aLeN17iY@3Q8t5y~W|Y*Xh}G||jt zeYkc!Nt6FfWWJDF1l67^09L%ZvKCqPtVcR_yC-$atoUc}fIfHX$8@_d1Aj!hDJqML z=H8$If!bL7E<8+YUiHYK!+qhlogG!fMSl?8A(!nTeD%Zemo94A>nrG`H9BmsU=tN2 z(qHT5G<*H#|E69VHH_B*HGaGAja^wk6c}{5ROgt_ctMNc^oI#he|qP}1Ger%oX*^N zDeC1dS9WN29lZL`B1?(9j1g3mp>s6ftv^BO(N`0od+x_Ebo<^2w;?qBgpA?F9_f(t zg8K94@_J%fCKWr^E-v2bZOtGT2Y65K58i9$fWz{5y!ez@&8+}sxp}+PR_cG8|S4^GFW6j~_cEy_B{0H|Kqk&|0TXRw$h+7r> zJo3CR`j;S+&%M3*D3-@_N%Jg7{bb{S;4y;o0}WDKgQAcy^{+B1PH|8A%zB<7U*F~* zx$T!1Bg6pm0ZFTubh+u7@xTV18A%?KT5z`G(6)jn;L`_c>}W$Lde z3HwI#NR#tpinbXB!Mk$X246&LK#?OFK%=z$zRQjE4QfwJ7tj3YOZLDWLtP2SRTVEB 
zwf%a39?{tBkG8AM?w$=@u{p=k0t_z=Cp^+^{#U>|Ya{e~U~9MZTY2=bMW?G8MR2-v z*VoA*;~)-*d_7mAo%H#w6EveP$l8Z(ekZ0c63^Pt+YA2PjOM!yvp%=ei)(IZ8rskC zuP~(TZ~C6F&ks~kVK8__DF6(y051?wCnol5_;+Nxk2f{2hER`<8&d6gtGWMOI9yV% zpWHQ{x4G}(Ei-^K3=p;1TQXDBBK8Lb6yA`fB+;aH?T=t~iAhhXM}=)xkExMR$(%$L zn|T1Yv0m-NYMNb7dH`gcCDn*28;$FHPhV{`kwE*?J}AKutByN) z`jO~$y^ZW%Bp0)eCpKCbP}gQKcf0B@%`blol^rgv>>g^4EFRTYm$&!-dr+wmYWwOR zIyEH&dKmb7==_r~{^5r5W$Q7z5gnQ`NqFAg4EIz4;YDwI*}lXFk(0E!dDjIRI+rBD z^ktEMq?iAwjb?vZ?dnDH6Gm{u4C`Hi=$n+ZHWfd=ZvVEy^=S`66meD!g&UtG)M+2j z66y%p@^MG^aih-jr9{n~XE`a=DE7O7-Gk4_Lg>e*#>_aMriDwF{ldRl(7uGYe)YH5 zq6u8dA`Z(G;pZ+6?>Xk6thHe3WU4Ge$BoGq9f3F@9WAh0wgXI3o+VX2DA5$ z>x+Z?aOrZi+@JU*;GT$&LD?d$Z@fei_IA@ivRMs03|D7;DAR9Q<%lDJuWxBn1zrpn zWF~ZiqHp(@E=Ur5yY23&B+)qv1I2DRzW_(g#bLt?^ZDC(z?_$ph3%IQS?i-+eK_$} z?hm~w6sE8YDd0;zoE;iB3k`(3xRb)?D{?!x+6IEA4qXeM~um)AW;@(BH2x1^K-P^{a;V0pKPVJRZ*b#^f|Iqsz2R z++5N8>&MD!MY@-5u8zmLP#!*goOlWQ+lU`Fc`lBl$+2 zJb6u`gB-JGK$tYnLe)q9%+3GmU)ka~A}astUs>6I>v>9o6|EvVM_ou};x$ANg{}k6 zUqnx&IX+s_%QmsmFKUV)LWvu~K*GRXUtuqWh_nXJ;g3*gXL2-N(#`JJ!<=l^nd?BS z_CyMftv@Sv4kxl@YKy!6L&O^C3_j~nuV6*5jc=fKfTq=kO!<0b~AcqEZe;gtdk;! 
zZ%pJ~<7-(_j#$(t&Dq#^dppM$M3%@{+fcLKE%p)tqBN6V8YTZ#)O17xmpMNzbvC!f zLT3fzN(p)ktY#3Rs5q(R_+?}l+Ws`M^Iv`$+2z6Dek4bmubqr|+VAnbv*Fn}hzXqX z;}BG2eViG|7fG+j`g0XuAZT^VL|OiWe9*)_YM?;UJ*5OF6TcCS#nXU&uc zshBI^2Y+=q2jHt2j*2ZOKjy>eeZRlY0FTx34hZL1DgtDP_Dbl@$4Dif@V(FA*9{Yr z>%qxlzlEXpD7J{p{xQe3z{?$J7OX_n4`?wJgnhVMI5b?d}DKQLf^L|Q@f5l zJ5byx3Q!;}enO8Sesf^}*KmtA?w6*`fer?q<^Gz!AcXlglk2*!B0uk<#*?1Mm*pK+ zR$TQup9`r3`kmT5BL27FS~g1fk*MR{CIQb>FKwWHAhW5+STo}>C`591;MwELRMIye zGL+{!%ylV*b$tR^Py`u6Dl3>06i+bz&||BE)0eLi{#`A@fV5p)dM2O$TT$SuG`EK$ zqD8*y%eas_3Z_Ai>)zpUs9Nv=QLoI$$?kai2u~bZKSVex3~v1x)K-18_~-7c`i8%UeQK z)r$OQ(@9oJolaMX2{G4iqH{0u2MC?IpM9KNJGMfZMTnaIKKqp)dy*lK@HIh$^7b!L zM&Br>#}AdM7aDVJw153A=!MlmjpIb z2jdG*@tS)4-x>*Duf5$RJDro$Qc~qw^s@b5<-KQ6RNLCNixCwO5omG{BnS;imaIaP zbEZk!O^%W?sHlV{NY23~XPcZ6p~*RCB%K?97xF|7BE@?v$0&rq9-rEiPUWlDBQ#p29)L;AO*R$Y;0fBU5gxynGaFct zH}Zqz%A$rLx<|i^V9N6!03Vn+C!CQ~@Gl$K2J4AiVcCP|uHesYJw0aFb)4>1OzS3! zedNaSnzgWDmmi9szuB-j6>FM(6FWiJdjKg4j@&EDO8_%)WWMp20W3fxMEQT!0A}Tm zXt{w6nM!8^tXz7|-RSS{MUK*MVYrc(7j0`zQ70O&9-6dEBL(#wyV1;}A3dQMFFm( z`rZVB9<{>o>ZL*0OF6UT*$26nWy-+d5(=bV32l*|5Gf054Gu(}x(6LiurY4IAHSK=(fJH67YZFJm07Nf>?^3TPA377WmXE7L*vQ9F0_+IW zsP2Hp1>NN-7wuh-YPMecDOiUJP3JLKBo!E{l7ZAyMJ-OT;WYDEUDdPUl^f%#-ck-E zgLT#g43efkiueQGas~k))kaRN-`ehgJH)cV^ZHqB!ar5vjz14>8S>zftGXa3k1w&o z&AAGCRL}I4Mh31*$vhlo{cnPcFK`0r+$H4hMpI%Yl6f+aF%?P7@$)!%RiY40daHbS z3XNfj?yp_ELdf5h-u}!ZAW)cv zrkbJXD8*k&wA8{K$0GY@K8;bc7R`SDY3$e2fNGn;LoVdZ)p%+o_OdfW@uM z(W~pS_$*(baWzj{J1S0%=RtVTal6>s2U|`~O4ffa zFU;uemGyy$HZiZ_4*Nv;fRdttZG{Rxj^gKolDN33Bs@N0x65Z{7=5_G%#)F%UZRDP z`~8CF!pN`lLUXh_ilmr6jVYi3>aZVjCt*qm`FVFv8H#dOj(BTRYasBJOww}87D-xA zHWteTRUuaPxXW}R>hZH3QXXlf2e+zMVca1t7I2@AcUc_wY+XCoseai|74m>t*STXD zP%sb+eC15BEIzfyVmayFM#-{`jk)G~f3c)AOA(@+>un7iFc!g;Cs6v*MW0S9Toqg{ z3Bh%FV{19f@rV|Px?n20T7dldXm^C^lZr;^b5kZcyH}=)Mo=8(C#j%;#fWaEV;8>) zGAQ_YPDsec=li(pCH&54H6lhAOgrKYNP4N2DnUB#)pfTD*YV<-*72ATs$q{#-sp zh{iIYpGe);(l?f0-I3&iibOIMeLEMu5C$CTc0#M_wVC~fs55@8k2+v6k!)@At3vYE 
zg8XFc8#lzJXtCs>(njAjTw}<=V5i80WHBqb^XxD*a{!Jho&B8c51-B=VNXWKORGRY5+%sF z;jCxHoR6d+1&uS34#}69?EG@psqL9KI-iOVOGFF_`?8E4k{&iQB`+zB8l;FLZWa5- z1BWiVDlFIs{9PWVfB%;}Ode6d?k{D^SVRt01E!+nO@DBNj$4GB`3O~oQ~1k0Dd^lR zx1xorx=6%_rOdUynUCNF*&gZCTFEBKzba>e&mNI<%p|<^;BjM$c9*l+TNfK{DX~St3@J=^n|Xe#uMI<0M`54GNsNH=Xu<;Gx+Exk;^7 zA?6j6dy76FY<^Sl%+k@gJNW+ppTh)nki@MV;#Lnf-@;}>01E(Lm#f!_%HhKqo_ahVZ^Dr1te%{ zC>{lzeUVP&&{CP~N>`3xy492C0_1b61Bt*IM1`!!%8wA2hAK%!C(^7J%6Ore6Nt-* zA(#g&zW4#C;n^ zKO>28H!S?p!aP@5jx&5_PX<}Dqlp&`x{QGR*bR=vFE55KOj7N-6s&iN>-oxx=qkVf zbkL6ji|7VI43uItQ~wf;HIpyjmi-cqn|G9#JpKNie%}R~+pfLjbZ$mJ6Q!Q{Z_2Uc&)>@N66=Y;(77+!ER^bJ?~HIaQUq~#W2C4|k@bSq zPW$V-&Qu}UuV`|&%lCF>|H2E%VN}&J%ox>LIbgfbpXkMbUm&=3`9R<-|EKa@mg`q> z1Pdk=DP}xk&Lu@a9)WKHFAzuYGQHSzH=W~8O5i=>iV|p-XU_>@ODpX0&otkwa#H!| z{FPJ;gE8rh2_yE-NiB|Hdtc6&sQPv-XSS?~B=5w-<>CB^_%@Q<3*uYpbj zepA4sb~Dku=idv`kG=mF1?gL7=87P-oAlyj(40V4@Tug!1^sGmJ%R98oE1~1ze$O` z>Z0yGectg*wmJ9Zn$s?AZ1w0Q;W{b`YNH|Y@hRK zUdshZkNumRe+3juy*IIN((VxOEaPqX_13!8j;AxUppDj<74sFx3Mq@Xq{YIfTP4%%R36uo&K)XM5|Jvs|b zZ=ga=Qd;9(qPjKoONVw@`K?0}o9fd1$v@QkF)ku|QuyN%OZ2KHCq<{PXQG#5k>fd) zL28z4U(vFqkfTj}o?=UEPW+C=FaE*OzUtungD zv$=*PGT^pZ?5a?rUSNsO1x)7;R)SiJ%ktq< zY#TIM^`V6ZV4C5%?Q0milIeQlDJs(D-Yi{#VBO%;cST-jMO1ixI63uF7R??)@T)_OJaW898K|VZg??Oz+HN>S#>!n<8^XcZ!(m(kiT{gOp;mR-2J8{)rMiXo>=Jqxq}PJo%>VcdT$qt{r5dVrXNopEN`{>=6w5%S3wlMbaI z9)-{i6r?sE*nhX7?y=+L+PT#F3v zPkC8V?04IT6*aEu3n(E}v{Fti=l+GmkSHp*a%LX%#9`j?IZhrJvE$U2bfZ-3M+&r7 zYM$sb?w1umUmk=NF1=joI%9(t(Q|*g5NZLoL6iR((NMH|y3XF4{Fg(Ffupq1q;bc9 zP2J;ho@8(UU0Ue(oXp!icds!p^ol3Fwm7e-RAevbEq`i2UswqY^5WMGAWGrv)$HWPE z$kKX=q)&yj8DiSbbz%Ur`hMZt*H8ct2xF>GYd^%t`NyD2gW zU-Z?@5d#{5&Jo((2zhfETI_PNFm&qU$X-=5>s2$ZB=aXcQ8-x{itpw>YmeqF;A^k1|twB-sHkaTtFR1jKatDX?04%zA^^)D-$Uid$(XuqQe04tgrq7kg#$VP0W%WKW?94~D;`1e4a zK5S}LRqRK~t;ocDtV_4$>DVOZU(}1~oEUgGT$g*K&{(X^J>+PYBZX-4U2v4j0z4NL zHVIy(U7k|wz~kCzZ5nelUIw>?O;U&d_C-lNkKOz&hMR`BZ5oBerA=MFWtJP=8`@U5 z53l}hV*9bOjqmB@fdn}Qacs_+7CA++#oshcDD3_L_3m02WM9kMJ+AnNtZmoz#L+sH zYE)I;L@_TA&+L}#A+4trY50{O)0< 
zly9zSJb_(5ztodKZu}0Pw^quWsb^Y_E#eTep;8ac4*$0yi-kgGOgs_t5>)1wMM@P$ z8D+=*Z^*TwcsddN4?VJ+5^lJUjwMdoHVJ6mU^LJqRNRLejL!f4zMAuc*ljj)mzi-=C%Cyq>Q= z?!eRj6KW!BpD;kfh$lTdnNylrQMypc>14LmqU!#A#K&P#5iRkoY)rS)44kJ+vU-Iw zyDaA3xN#CRN?>N)?l_1PM3tx1u5&Vnr2h${;E9|p_x|axGs>6o(ZI7?gM!ryiLqy2 zUMb$+7M4M}WZ4f97)C?wksDO>8^I>09p|%Uv*ry4RxX88^_(n)o=vsRsx1zFYkdvt zfSb;@qk-V0nef#H?*Ubez#NQ-8h6{#<>C__lvBxvufEGt2fABuGE zT{VetjBYj($+NoTot~RD`@w7wUY@;3bZuO~@PWl<Xf!htm%lr>|ma_V9vN1b_CeECY^GzKI4s_gj{dSn6%7cqI+LZ~4>KP2_WfsS6g`+uK z8XVS~KqeYLX|8uXV9a~=_dbL;^8brI1P@A5HTZ3%;s)k}=|m95mi=6BBOsRp_Br9N zsQ^WiQ?l_}5dZF}iojgF49mCgrUqit@Yj&uJNqrm1DHL)-_f$a$d;oH=-H#+Qh4Z< z6n+m*Q9Bih(zP@DcfJK`{+n;X+Fl`uQS8Rev0l65U!1RmAgF5zPF4E54xSj6aBg{} zgP%!&^r}hfbRN`v%hy;0$Pi*61nU>f()Y}-GXhsJL+N4xo;IRBPs7JhYUvc?BV{>UysXi_8oE<}-+4LXm?!4rvyQ|3W5k;AvK_M5!wM!S3?qFB|`i~K4)Kn*a_S*gy z(K+arho2_5hWb*<=uRC)hdzZp@XN)iUXchFXj7D*>T7bIpr~OLh&p?!C8O#Y=;Ii! z`*7k*r7#}zP9&c0xq~CR6C1xG$q&aFvq041A1IL^%koHs6e9iJ(W)G)AgD5T9NCLpc#$kO17HPEQfLj!WF(SvE+R) zt_SGMayALh*egE#l7(cqqVrWr5uMdmBxdJ)B9xZfm6 zqwT7@qidlUupQLO`UXO|$=nx>uwm`WZ=C#`*)P#qnf2fJ=wqvqf3N8^NW3g}^$lIv zA}O|~1MF>LjV=ITe5+Ia-|=YI55M`7RoRNv5(jWP`WFRTsVc?nqPn=m%BU%ZaE)0? zaLRq`@|3h7^&?+7>wcRY93gA4Q2VXI6lwd!jn=NtgeNNibAYN?mpykcsghl+> z9gNCIi)q%d(~#dJ8haxA-4}k?&)4_@o+5^2OM;**zSDh}216A>%M-IT?n}?XaC=KGV$3&;`E6HS3ou;E6#+2H)#mYSEaRFc|xsXTPmfHnhVB z>vr`#)`kWq^>b3=EEx43qEnUv z7GyR9411=Jy5D0qz!=fvO<9Yqqg`pu24KD%BMyH)`|ntG;c_e0q&?DeYwW~`%8Xl2 zvtDiQ`=fcvC9Pb<#dga}&g~+^_O5p!Ap5VX2)8y>&N@oRnnyRYSXtF_jZf;ftat!j zPX7_YmmOwJ4DZUOgcc50vV&Bu_N*I4tvKQ66{|SZ`=jocvq5OoXp+W0vDS~{(tTfu z_x@Tw%ZPz~LO^L*WQ>9cOI;iePl*5vJ6Lg$imuELjZvxLHj&1Y%R~vcB_gp=Ova$# zoP+s72!O7hOIJXyz@c|^zmlY_8g9T;K@eo!6fVdPDz5XFJsf~rv%*h^x;;QlaK+7l zh)ZGV!Vx4mszA2L2E;-EJCZ3R$!B&_m5kQa(FLa@5kd1V6Nz?!^2&ZSwPc zd%@EBTYNQaS@WfIFB)O$H4p1+Xw{ErGAGxQqe-hC>&*s+K|LcAsK&bd?*}KW6FFtaH#$>tI;S~2(Ra7iQ1YL}vH#c`mBd4~Rn0KKpm$urPtCroH}6%PR? 
z>T^ww9vbucM6YAz=_z<$?<*FeH`k~}iaen2Z%?KT$;43PXXCB|a|fwag_S|Zi8#s* z6cdK5E>0onneRkgm0FFkx5${&R%fu>x60%EAp0d}R0h9Xuh9X*p{%?z=B`uyENM?X zv(6*AQyngA&Be$)JlgL=0VoES2Z!#|aO=EO-1F2)pOOHyZ~L4#C&mPp9q-{iK?1E( z%gH-k&pvO4Hk~!DZrnM^@(x_?fgY~e_6@2Hu(B7{!1+p{}gJ3#(ucoL_83EzzuyhfY z64di|D68PxuZ!ZUp`{zLrKYzI4y@K|f8ccvlMQg!oh7Y)YTu+4zj?OiYcToV!u8m+ z4_r?hY+EvHP9X$-UK8l`>3-I1R$%7WukN|F@vjbZOvX3g#uF~N!L*b5Ml?p8YqOmu zTH^Sy$8E(kX+h_cy>K>vmTuxFrvm<1cII$41)UCwYAF%#2QR=>gsuqf`@AxGMB6iX zyGXn;!PVwH&eHMFo?#f9!#B%>_4)C2Ad>jfbOyKDe6~xqHv5I(qej_qO9aFsFvt(G zuDK%QX0qC8AJJBB0avtp-a5IY5AtF)y4eaj*@>$c(>iH+xb_fnDFPlvBhe3HHOy0| z&gi`rz}t&iQ>sab7yDT8<3xxVL&8O#%Mmp4kh{8n@j3gohDh7Kq3)AXkL|lIEx}zQ zOrqe?qt1;^$pN_3L}9#`-_8EmsE7+y_4B*}N}8TBv%)&&*OHAXWh0P_Td|7{X~P`k z5gjKgjeC>EjVe}qEZeHMX^!5HabFA)E=yAq!+31_qz?x#0`4 zNg(iRqTqqBY~1@&ph*9JdVSBWz@|LNS+xk8**k+V391nJ*;*JulBy+*+rFR{g5a&%IAdEpy5`swnGJIC@Bv_^7`+|uaJ5^)yUSf4!SZ@>K9 zdveWdNHq4d@u)y(X0kTlr2YzP!lI-6!njdg?fj2J+4b`p`8k1;0Afx;GbFZbntKd) z52x>fcr;j-(5#D8WL`j$QciwdJpHvxa^$Xt9oXKD;4zI+&zV-6{7V0V7DyWXx8!A$(dKw-$2by7w{pny@5%1>OA!(X-PTC1W zsb$V)6J;AF8>j?L>zWom5z-M5OwT`KTFk!)KxMed2tEVJ?mk3pB{3TZ`&=TD9g5W7 z&IX{n?_5ki+h&d1ec`=TlCa07IM zNNbyDbjiDDfZOM31m-&sagqTS3!XCAbQ$?laE%RCK3ULlt!rMtWTHC z*bZp`FbZ5VE!|b^c4UbfUeZ!QWt{q*`i!=o#itl8_q5CUV^MWXXT)V< zjMJbA&FIoevRH{9WI*|OJdQm9LO6@AXaTKSn^^L0bEqgk;QV0U`Vk^9wO9|W&AVFt z&QqNqG`7nYar2ZPuDBn?=U}dMX}WSjM3FSG;i%d+^^Ddt#o?hDbUsi+yEq6H?|?F% zjFZqLKHpF~cI1WK+!u=_iZ$31vkFY=h2Y`(Cl#0Yzid*hW-C){oQdSD>d}@{?wSe- zyslbv()^4J=8NvIfn~300|hjChAQi>+$P?l`4hh024OYju1Y`j!Y-eOtgEY9BQhw9 z?EJ@Ar4Ra|$IaX6!QvUeGd20xuBabT=P5~#GXK@mQR}y;$`=2k3JwO5e9b)H-3);2 zXORI1C~QHHG@MBewb?ap-*jaMZ?6z+e-1=%?79!TW|p9agPvZSN%wpV2c9tJ0n&+A ztUBHb@2&}Ji6#Ae&425ER`a{A18YR?Ipf~w8LU%;93Gt2;>Bz~D*g!JKpuEAZlqW9yP>_lHg@N zD5-1PsFh>HLE%#yV#I^hk^9MVCB77K{Hxhw)@L{CJki;oM`sbpK0g0@ zhp@7am581U2bQnoAPzd9CTCLfob|_U@P0tz2@V{%jn3*Ce$-i;D}U9ac-ptH(CnI5 z8T426f02H+SRcR~gb0%ggeP!o;YFJ@^Ht>4*rLZj$p1^*usoNi(SMtRdqO=3UG2XNe{rgr;}5CkfjY2Q 
z8k)abX)?&c5whz(f-7p5MzzQGzBZ>?)Dh87wOs0^wQoSyyD);Zob4kcBmHNp2KUDw z5H1}xW<*TIU=Sm8j;T?lM~QJwGafw_#sO%`kmEOPfv3wfI17oN=`>mEwkP$5Zt;nk ztgC+=MI~l9fIEVQV*9B=EL-C& ztb^Tyg`HcX0eR9gB-%J*Y53`7B>m*BYZGZ`{Xw2_5h^1$sX6NLSai1T)T0W4&+;)) zabRN8VZn^n*@5}Xd)n0UIrg}XgAU_+X4b~i=QfrBc}Wzc+GV@MEb!nB(&Ma4|9DB3 z&)Q46Vf$V5MAu$%jdkw2O4@a2XbQMkTKDgg&)Fkp*v0Y{wIazJJa|igE&F3QcbP8+ z8fY5krO0`MCieOhjFMHsP|N(~!)D+v?r{AH6oq{yK8bgBSk1fJ|722o>rT)ptw`$h_N)y(E(Y z>KU~YE_QYkx7yyJjjXAXLXEhEluJ2_C^0)D>Bx_L9_q5%gU7VzqIb9K;b6andiTU; zjW{zCQO`^sqy^lXn&bE}_KV25sfzu3;eHnvxv@MyIQ*w8_4TW$gdMl>Xhd169&g;l zpE#o#kc!BBcCL$4xv#sRJwEnYmnFt^J>)YlfBVn{axL_wi&;f7Ay@hs_EWX+_>3np z(g{n1%-L>MJBR(;{Cjd}!l#{3#7Cc=wGH10NkQGoC9-bu0IReMaNZ-naWk;q+}sl~ zX>vVCdNB3z=as4Jgp=|rmZ2X)ogCfkn8*)53KwA6t?_X)|Ek6D91o7%@K;YBo!s_? zSmc-!@{`-30?^m|-4Q@@#ZLI!-rK0KoCr(WLrkfJjq#d0E+Aqi6J12 z@9ys2f&KA;YS8NxbCW@EhL7E{68;lU`={*P+L1_>mjz`#;iY%)^?pHMP*C{c`)*H9 zdw6&R2a%CK5fGRJhip@Dp_7p9?d>!lDO%gwf|@0dRV>fcNYtGdg4^4X)SFcOMAroP z2s%9CPdXXyV_{*LU-afEx6pg_5dl@a>nw^!Ul%*taFWyp>|J%*ae8~K%e&Wo>(}qLk!>O{0hR&_=!mMX(CwlsOlrwMkYQ<`In`$SC z&zqKdHqXq8Vu&V)+OnB0t3s6$G(T$G-0D_I$%0L#{~$g2HZ8wTKp>`W50OIi6Eb*+ zZur~}Ip%OFYW>=?StWAqNgAou@h5&0aX$q;;Bys7J<}}yz_^%|MP!YHS9!E*y69e& z?HZAwVyl0$H;!1Xt@$25Wp^Lurq5JB)tQPaoel1H4kpC3m(qmoQvb{88WtFeVSNpg z(x3G3{YE@aV?f$5JdBt6ZJ=*cqn<~W03Pbj&tTc5FRvWQtDZHOa8$@fB6s)&cbfK- z^o|7^1CrBIXN4YiaTXSNYsdsqD0t+N4L~q({2{jbZ>lOEQ0NhvIH&=wM2HdxOQMHw z194=n%FRIk$G8HN&3nt!cjvjDia%Ss{?j^h$sz%|R8vnVXWv^nEXKqrJOqELTGrfq zMj^fYX1RAQbp*z!1=bj)nc`?|>s8RXxmCq&=e8og!G9*6xm~Q)1FM<`4b&9u?(RnV zWqdV>et|E*ZV}NcF3(V_8cuvQ(Pi9rNqRa=N9WAm%Oy@pyVaO{CXiBGizQF}OPb0i z;iX{xOS#YySb=w32MDp=W+stzr8w#K-hPwFtFtlIAPIH=j?6m731;Q+ywqv*kHQ~GzP_mD7qR17 zB63V1%hcC~Zmmu1FETYdwwJ{_OaxcCW+}0DIh0@8#;9qkA6J)@GEq7nODGbaG2Phu zrm3Nis!C5d82oZN#{J4*w9ZVV_gk$CEjxLh@&^G~r z_lweJC4+^P=#gxrPVH(J!$DyNLp4NBW@n0OD_I)^oMm%N1@D2%p`3Q+TBK7uL|Hmj zwAb|-U^?^Jo2Y8{<5vyT?=Tx@)D~%#r{HUZBZbeC$l*oxm7~HX8hFI!3i}iIj+YFr&qBPWxS6t1l^mL&VAH_J| 
zrK4v!2~kH(@YBJJmTmvsp~%lg#kFxpqg5)a{#L^;#>(k#B)1>>qfI(tY`L6frz<(@ znwn#`*n%qsrR8!b$CqdaHN2-Ds?A^AQFV|y~}EQujq66V$_cC%OcYyrbmfu15_B$5x-&@8J<*Jzll zz!W_tm(u%APlY{(N4$F+>AO$?XM5h@47SvZt#n^M7}t*0DtB?|DOe}WsBu*6+gO=z zl53R}I#X_-X0BAFHSCk~)hM!Lg(+IdlZTcL<_W3|%2+t)MyO9l*ox=*4j5RlIi7xM z>)%Q{*BZhfyR>dOcspjh&upa*?F^E0Sci`43fJ})$uv-Zhd)B>5})?7bHIOoe<-U} z_=57ob`a9wlmB^K&O&8N_~3HtHQNSlEO*O^^CnniQc-Wnc(Iu+?Y1Z{4aHDzJGLx; zcs$ZTEqkXnCpNq%L~8Y?pTlH@AR1Y7PA*#+5o_O~v3NqMqFdMbPq|LqahXjW?7TOMt=09cEWU6$Uw~KM=WNt~X)~Qu<4sdu&CoSA{W93ET zab((?=376lebMT~q-)=n&Nr-l&ai@B>!^ac!-tGFU+3h;#qWnjokyeYjw;6qitI~U ztZ0Gx&R&xAh<#Tm%Mnz7X=vFf_py*aEI)_^r@!E%Z_RJf0Dmp97`?^#&^a}})H1&b z)&eET&ukTZyWN|r47&`=!AJa)xpV zMRhUx7>Dwh13g>KkGTT1kL+hK{ddK|*9C-7M_~A_w)w8mYz;f6D%0t6czcA({49mwe7WS57G36A%lgNpw zmSRDn`K^H)m|2-DP-=BSdZi-_3uoDg#NHs+nRgyp1dYeZxY%1q4Ktp^V=AsD`o`M@ zo_!jAnvFAt<>r87lR`wUuVcjlzrlBJf@n2a@Ws?jt6%s`>9)QA8PgDZS3ZzNbL{A4 z_%v58i~3$#K{x?tx00sf%XtF)kBhluV|&kaTM6ei!;R!4C%#rJ41_bG#TqY`m*ce8 zQ>@RlW4SUNj)tVlFU@w|hM^-{Zz`H_m8duI$W`If^k4+Zf&*b|wSjFBGKW2wN-E$7 zkmn=S}@d^X$gb+?7$B_Vy4VnaaL=W-Cso*L0$UUvsyz{6*~{P z>a55@%qH+rWd;2UDZ-prW9p} z>~O-TGl#K50av8OX0N*^+*?k;v?oy{(tNPOdFe};oLKnOBUIT=$(?#U{Lf687r&n9 znWr`-?2$iKKHgVnoc2&$2u3cu+GWxgPrGVI8v7bTk1d3xxnmiCN%!~f3m zRHt!mIPqBaEa_rU&970>glgu@*H)YIyE%OYJ?a=GMC6A@ErCIqBLejm`=W*grz`QN z6gDjn1AAHBXSx;HG-U~K6Uv`-n>w-w_Cow4RgO(sGyVOu@gs^Rdy>ods=0uv>idEi z*S!?c(#uV2ky6xZPugfjoLrzO{m_-v*z&%b?j6ad9q^6}CgqLGJOMI$RkhLh#EZqH zp5#AoEM&=~3w@C@0|M9pqt9<0S?+`d-_}eBEi?nw^68O&A1=iqMBLg@7pv>J-li{h zd!OneTwM7UG3SY5f{uc1EbR2Uj>!(iL5t_28SPNYnLGAHnTxJ(GLI44uf@Cu>P;}>D$o|dw*CH9vy zXa1pL7Pvpz{Lt^U1D_#O%+aD*iv7%lg=jiJ`1P{%yo*=J-+6@>ak<%bwF-uf z^@UXVe+ai5nU+=5l)-(YP9>I`-;>ppVYCB%nc#!=zn4Xqref60g#|WS@P*Drlk&*< z@$0p+-&=b7lF;OmK*d79Vxm?+`20*dTS4ollQ~TBG|J^s(}rf~Rve*hceUqZx{?g7 zp9znJFH*{N{BvF0rI&W}K^*%pO%cw#yu8lKyw1+N&bRiW62xs7 zzAG^{+&urvH<`K@$HaMHQ<)>9ot6W$iz_X`Z1;$a|ihMqv*%vlwVQPNBO1HNw z4otMQA7q!W?ez}f?tUD8h$DXhWL?&MUUSApeAA-Qk&D7JNvudBoJ{hzOoAKoj8VEv 
zGV<1<*OD})jZNP2m^LAAcxZyBPDfe~)wc8M0W}D1S8)`d;0U3NmJ3Ejg}xc@jdTcZ zU8W*59+0{>&d=*$ru61>RUPf_=Oe>=qwr5`H~ZJ*$Ii8I;=F3EifXi69SatwV|WXw zhgGx#$3)&%cQ%qk4i(ipEK$zs&vwjJypm1Lm#?me!mE{bQ~kI+@y3Rwd@%l_7?|(; zcDzj%bJ}8invW!*=U!RMSE=c6GQ|+N9+5~=-4x%#nUc98WzHiT5qy;?0_<^V!aURH zeB&eO*VUc?^%pj2Ab7#mLk1QKnP zv75Wqft!1$AdWWazCNSSJ%MeiG(*aEJiPum5bbhd?!5EBz<@LN{0P^;amjpVX?{jt z=C`L^N%3>fr)r_f7M=%t%ch!g0n27tRmh48!_?K?`l0UY0+y(w=JFGVjgi9I;|0=t zkyZ?EYoe>SlblAKwx~YIO|E&dFKUNTR*5^G)Q2hS&Fsfic@Y?8O1aeNjsZK20?$B0 zd+e2)TQypP&q;k*YPH!64tOwm(yRm|mTjK7mpCDBo_zlCc?FKi<6+y)JWtp#q8g7Y zyHUA6Dl}`udjfm~D=2xeY;!BPI<5g0$ub|=bvQm;NPj*ek=C6YCFt~RvC*pGQOx*K zg3XB@W_Ur>iTUNI^9gI-7C<37LBS`VQY$vMGT(S~UM4Ov38wpKj665+bK2mY=Q7T4Koiqnm%cb|e=ld)BK7HtqMTLA|M;!fXuVAp)xQxkD^MfICoYucQ#_2<<5_A8w0 z!O32tC!hMRZQ9F0AdqPBz_7^3^rwx5ag6fWA$aqH^8qPwa!1X*_1sjZ$O0a6dA=tc%6Kcs-lgc1hV- W)^4#alH`5$Z%K%pXx^*WAN~(+R4=vw literal 60796 zcmd?RbzD^6+cvssB?b|Z8bUx&=^h#d2}uD7X(^E$x4*UUAX8S7*8%_n z2>7D8fd?L$@n{YL|KQs^R(lKpWzi&;<^ zV{Ym6%8JL!!5Pd20Fqwf;7125H*;n$2YW|XaW5&Be{zU}pZ^}_Wnum&i<_Mki@usB zvz(KQ6|)Er9}gdkG%+(Xv!siqwYZkN!oQ1y-=tVxySX`w^YVIndh&P*@;JHJ@bZg^ ziShCY@CpcUgE_cey&c`my|^7+S^ssB|G194m8*q|t+Si0lOyxr>zcoEa(9zrVfkCp ze@p+e)5^>Czt`mG`tREUZ;3G{r4*WQ&Ii@R8&Che<=E&PW{hCU$|Pi$T>NHmAXl5Tew((mA`g!`**$n z`Q*O~O7i}__y5!u|7z%ej)E;MO)Sa#-=3N@G0smj9PmNjbXGEO1%SKv|GsdPwb*t6 z0C!tcRp%)di(Ow|*Vfh!2nbkOS_%pZ;^yY&;NVzZUS?-!Pf1BpP*C{z@#FP1R#8#$ z^72wrQWDn}dwP2M@ZrOYi;JV9BXxE4oSdAiYpkH4APff6($eDM;zFTN2L}h!)6?wH zSbRS$kw3PkriLX7%Mpvc5r{oIJJZ+KUs+k%-Q7JuKNl4hy}G(mQ&ao-^QV%M(lr)4 zHa51fun-&^%oKrLU0od=9gT^J>F@79IXRh{nnI(|v$M0NrluAa7K@9Ej~+d$tE)>- zPp_=31bfoX&W^0C?1KjnXg*+vhK5W`OroQsIXO9bcz7Bc8-0C!zkdB15)u*_8M(E! 
zotBoizrQaaATTj8F*!N;`Sa)R-@jA7$GW?@^YQVm?qd%R55vR5t*oqyi;IuXv1Mgt zR0yn&j!u4lzLAj;Gz`0YjD7OtNq2X5U|^uFt?m6Ntdo<|?GWtQ1(qxro1LBg>C>l% zhKBg~_?ItVN=r-MkHJFTVFw2X{rvodg@uQQhY13(1B=*)vDm`GLT_(xzz6H-=y-g5 zEGH*-FA~cTj+Ky*=;`U{>+9Rw+q1K?BM!n23=F(`_pZIYeSUr(iA1)xw(9EYVlbHT z@$t^iPG@Il8ylMsA3l8h_RYY+ATKX(b91wxpkQrnt-88;V`D=_MMYCn^Y`!H{QUf0 zUS1<3BUM#Z&z?OC4GoQmh;VUnF*i3ibO}yOOl)p$4hjw~EiElCFHcTRPDn_|%*(7^Rs@L+P5d&sj`gp^Hh1;$MPc1Od@8DG zfreo1f+A1Pu`D03UrTF0rhK*Z`M~)Biyj!6oLxLSJFoePu59j{TUuRN-<(-kLS_|o z^bIHHl?{$f_5T?ko0+>6jQ#ebVR(G{_r%P>`cKrzO*tPy&4#8G{Ah>akg|P5!gKr7xs1E<;e7$f3JD2GKaOnQVzMcQ*YPHg zQ6S&mt{C+rVv>qDnfktSa3UIo>H6+zH$>j>J9j#_d;pQNlh)JMPokS{LH-P93nwPo z!I3rDTgwb6E^_03;a|WPX=!6_P754D{V*PTIxT)1THq)m50jugc`xiH5BAW!=p0{CW(H&)15U9~HrR$cx3? zDrsg~G-WQOgZwF%F}+oqe`aPc7yxn86sF6YK#X!ZOXOnSz)_UshQH%?}VM zv+Hy?zHAGiV_O4)6xnzcMeK5XSan_E7k2`NzUy0N`#eGR_coshAtgf>*rXioh+lC8 zwS*5uv^lgroRnjAQ)CIN|ZCM(qKfVKm zk9e&Z6;Qp*)LX%@g-`UpP;_eJ=4@S7Ncf2QKp48x7m_kCu#w;ePhvcuWop$g7nFqH zJ#8hZGg|)MDE(DjxhZ=3N7vO4EoiZ()8eyYq@-QEiu<#!j!Ib<7LTbYOG4#x%2hO* z@(KxZv!XpARjG>o>L_TjyJ-0Z*_Opqwgb)ud2JSDiCN}kc=@C7MJGU9d4I{Nh|K^R zfiiYP;-O`gNzgwI{Mo!oY9`f@2GiDZc$vm4nATel!p1+OpRp-(cFeb#TjB+3*fA%^ zCwR0$9mfW1`Be8;%>Eqz&_$Bj+FtC%vQY-uaiHZn6KEI_hFffkeKM=;jlyBBvm#FBwKx zUGm+VDygWbdVgBAzuovQcAJ(#t(I5Ggdok`y>GHb3dt=cOMzZDittRP7VK2MyCfV3 zeTK7JT3o!%u_3-pKHJPD2FuV(wd$B#895VGt}?BiuHdj{B~%tzvyBq#XXJon)71#s zlnIC1YY_PHhJ_XLY_)ri7>g8}BR#8FP6*K>jGPIMY2D-Flpev`4{>g3WNP78|*i1nQW5*_tBH$9F`~_MVEW((DT9ogOu7X{nAX`JBHFv z_SF@4hcDj7F?T9K6y=B0&(E*IC6x9Tv>cMz6l3Me{}6uu+PWZS*FJX-$y$xqJ0(IUwq(%oT8rC-i@qL-47;d>EB!6s|!}C6`Xk@k{jJQSr!81DveAL-lE~> za28b7rGC$Cz|f#&#C+Cvj@3QT;`bT;l3M2Xw+|hHeaMf*OfuZEsPE=@sbi8}&>%dC z99U5j50*v};g}&U!#Q(}l#rYCGBuQddaUIOFi|NlhbU@yY4QM{dKs^>vE0fc<_U{N z_6VuXgv4K*f+QBK%SRb;i!8vWB=~8q`?Xqms~@iq>o&fit;QuR!qIGufxD4yT>O(x z5*;r@BgB#RJIh%ft%}~y#7)+5YJ0zjK}BS2=+KO@Y1-+Cr3L2@pKLvFG%^E1C4Ie* zS?uHk4{aGe;==nQQ259L;1(Z>qi&OqL^e%WqDbRJ4Jb!q}_h9GE06FC@GJ 
z=krL~{A>P&Ta>MN7<=mkf9v!)I87tP=v(vH(c4rA_bf1@3}ns~9}j*{1cEK!=EUvj zArMejkQ5kGFYg1dtrWeIH~ad_xL(u@XyE&KJ}GgC-ZxR7`(bhvK+DE#8*5LeeZ{MV z(k{B8CnD(WF#%xHuZqvUcujltJKIlYbOeY3T+hD_v>pqZpdd%~dPRhUuM%xe4t$Wv8yBzGdv za{RMl?Y=k!gS;#PtckDg8WwYbTnh*7q&$O&wlszSLN9cLp2`oSzZGc71Z8j`)mE9& z#6WgrmPqO6NH$&|zjcx%kZ*L~6xo1Wulw_+W6oxTmJu;XA1gnUj#ooiBNQiTaYf&X zOd@1S{n|;uoY&AG3CzxDQAa?tlpNMoWY3B>}p}Qr_vM zn;--~{;Z|jwpWkT%e(S5()Y7{NM zbc~L%yii(gLiHDb3qiK%ypZxX%zf9-8Fxh*eL0k?eWok+JvfOCog4*J#T`l#_?ZaE z9IxMUWs?e7sCdd&a=-H~DV?p9^o!;^>5e9#Jf(vXj{p&dtP*o{LjQPT}4G4KejxBoiJSToh!waZ^2K)_V0G zRTcgdhX;KEO!pczkXu_LI>dZ-Qa6wz4&WsF=U=T|&*~@oNxs+oQ=8DOqgIB5m3zsR z*^ln+(^hLj??%6uPT~uP!Cy6@;@tOT3|D|6Lfr9{KX) zyk?u0N5I z0Zt(VT06Qrr!I-SiIIKVH!u7PWQR8!KLr-L+bC+tpTUPW_oF{7F=Ci}K4s6BG+(C> z1U*M?nd+=Oqjzu277+XLD;IqpPoG2&<(v}mt*{A6!OgDbk-{Q!Ri%xu7h+TI{)s>u zm_C{pwF~jln;K^+H{n9|JmY2b$K3LF{qdozV||92o6K|B^K~HjrEp7r6wyr#T7`~k}s*d2=J&@g*G2_l(*Bk^NW>B$f9Q}emCl<9k>P(-ZSHLT`o zUQ{862Euz~bG*Vr@0sq)N2yP-AhTxc&yX^d5;WY64tKQK3H^Y|v*};P0U&8o_0;zx zjr1!d7BlB}Z$%gWUZggzr7vjRAw7hiA5EYHcLU!|IEs$VhA z@A4$$0vcVJ!3{Te){f{9)ym8M)=(bomV zp&b8y6XE>Pa7{&H(Cy&6@@S&4 zvoQl@InO*F0pLOhvuHv>kBjYUVWcwlti>0^Ulw*bfEH8L8;sQi1Z77pm7C)`=h6$# z-{Xx3ch~|)AO;d}SnDGQ*YM&Z)%*Yzy-zCIVvoGoOHt+9SJ~1q3YprzqA(>vQy#U{=+k15zNVz*>NoMRl;unkOeBF8E3I_bIzZl!37s zCH5G+O#)|degqh|LAw?1CJ&qspHLv~lR?XIvp$TsP&L__-L+L$li!S~q(>lDI}JQo za?!PomXV3JlqztV^J8Mdt`s_#&&-a{P$r|S>>Q3r`#qOFBTSC2K=@SSzl^PEIdHBO zE2TVmaY@#KTcN)6+f;e>IVU78Nrre<$Pn9C9}v<&AasAXsbARi=`k+sXJX~UXsp?Y zjw*C}zZ|dT2?X0%KorjktwH$WLG$r{neMkr=?z0VI?TN&oAUbhhpF8NImilOUoR@{ z_HihS6?bBU=@AcU(|*cX;|{7ZTKw$HI7dZB3>#!v|Ljali@v$sVypVs8JE)-*0od@v*+GLXXedT^dur$?K^RFW#2{=hsPk4yyk|BVi;Ytte@}2t$Fl(|U z36PfEIsJGn33R4y5*Te25(FI_Dt4Oxdf|xY9MV7reB_!SHnXFk45qz%1e6+wgA|tC zPXnH$obgl#s*4LtXz?GEu7f(go#FdO-?o~pdY7iFi{?tlkSkR=AK5y^nPKg>8(C2 zd%Ll+2Bdfcz%G$*u*=3RY@38Ryem0*sEp7@1Fz=iG@ryDY8-&tn~J8CI4TJi@%rb< z9RypEy7dDHN8!)oRfclwh0+v7Kyp^&!<&6Ob~Zd&W_2iI^^A)8>|Tup!kkEN_jSG* 
zAz;=J8s6f~&a)-jo^?41e=Y7yD(hn7ZhTi+x`QhB9v4IzVh}QaElL~7#?W%30ta9k zC-J*6EtT-%)?LK1aU>HkOI%PGEbm&Eli)A=?uFMKN(6=4X~w*>r7LyC@zb`fS__|A zDIY}PAJ~uB9B_W4Hw)$bj?OA6;t_#p5MWdm@@wEU9?k@wzsz8z|$Ab}bK` zI63~ATl5I>cRS;~5O=qKdRBUQx<3Hu5mE<>j|ly}_Q57)dzN2LnH*UE<*=nHoF*f{cz$X~E#kc2*cK>pWXG7>Nh0BCUyPKR{-0 z(pTfHviSy2an0Zdoretuen8P~KLBDH-U+raHN=scP4$`pdP%XQW*MMO_@`ZY7!Bw= zaqq>9#R8uTqHwTiOl6x0xa{4|=wgrd?WK{Vq(W%mEZ*Ox22Mk6dc)`W=4Xkw?~qoA zQh)PAhRZ%IzyS`Zg55p-F^{}wA4ko7(PEYxoY#EMxGEigcuV%oNSeL1AW&*l6$eHE z`YmIp*XnN}1Fqy_-U!j_dk{rRm{ioBEqes6ws0C_XW7BB8eIvx`Pc4`#J@J;cQrR< z!{n(O$Kn4rQJ)XHgs*kreg8Z9Pfs9OPgQ8A`R-(Wt3yff=%g3bUh&`7(}b$C^I<#Q z9gmRm;Hnv@KEqKLD)&kU7puN`zL@?FF^=%APT60O@~q%AFK7c<{lj1j*@t4~3-{CA z02(GGX73|bU!239SX4cwzUK+k{H;v7uc;4qUm2Z@Dt^F@e3s$r}V^?HTl{P;KaQH>ZBqlnkfz`>65rhUWmMx3HPN&=leqcczaTM|Jq4IIq90|E+EJ!kV4D_D;{L%XVF1h z0?4eyO=dvLwt2X^XSPv-2R)U>9!TyJ&P3fYsy#|gz07bZw_nW1wzs@9@!Kw&dRxR$C~pYn3; zBe95d^c$DXD~j@8i5)+4H1L}nTi8X{W)5L4{u1wGtSLR5xf!^Fb`~~u8~7w9m}V-G z{?Q}T{P#Bcbxlv5xW?x*$x~1CHO3RMj@K8NZQUb=8k{l*)QS6k<|Ks2@25YdZu&N; zQfj#nkDwhdvDZ;Qy-)`I+dB$eCWNQjK#gVDBi{1Dx zh9M9xgW-%RwApmM+t!WtxzUP)qTxy8@tfu-PJ;uVA0&4WriK~UtN;t~ZHJH#50rjT zv6OF11r`_m--B|4YLV~{eh82;vtaF@VW7VA|0xwx+dwdvkGcuf7z6g*Ma_2R4lVs7 zS@yAsEniE@?FWPeo3;trkcLOX#khKJ-hevV-?um5&)?r*kFW<2Dg?d89wnP@6y50z_v$SbqN~AHEEbpeyG*=jr&sk z^Vf%^M$qLuJ5qN|wr_G1e=f|1S%tth#Y~_c4+n?&&!?!aJaBa4RfF&YqHn&95uWu)ZCaYHuuNm2mB5taFZx~dvW~hSz zU8%Or-do?}?aCNlFcL@v@m^`KSVhlX}HUuEUHLQi7lywVHk*BS5 z&Pk()VVN^(BotCnz3D6&WgsMs;fq?oOmuEi%^OZ$^Qh|{eF<$UVpT2|r)@b+Gx+Q) z1I(7cu=ipJLp7fDWD?DG^I?8HATndEf7!c#6HsyxA;~y^FO*uez1(?%6ZFbX&5zvm zg=hna`0`o8*~X7%UY=zfF(FRQhesC_REc&KuKO> ze9gm&x8F@-t4a7SV)4&Q3`$7grYwg+qu0d=y(K+h#O5ipFSWsf!CxqaWsXMS-B2yH zvc~i#(;X84^yo{3G=_q=Iub95T~OmRgDhrmzSJ@)(xKvDhoj4;#hScPx#C1TeVHmN z(sF|-P-9|VhabM{u_cV0l%6W4VZj<<;Wg^Mmk`Bfs^H1z$!UTPR@nB)F!Icy}Lno6YTH&@S1}5 z)5^)$;V^oIo#i5(gq^9NVTZmipC+gKdXU0Yh-on(=;eNAPU4k47r+xk4FIDy&-!t0 zs;+;tz%T+q=`WVQxJup}>v{vUaCzVIQs7D9kkf5|rCH>PIcXXCP$Vrd94w(M!KY4i 
zt2yif&>T>g0L+NOM*^XYdsV6Qu?GWk`;oQHS0);>mZJ2)rqtv!>t6p4epe0Ef1n#r zGpjwRL|E*_&xGxBmVcC^2ASv)YoX6)tn3Bz^l>92$xkwbNf)fNOd87VZ?p>G{}P;b z#H57DZP=H`%HRRN!uqegeCU&jKOv7Ox-$w{s{z0TT?=Z>o16CJ8vwl+9#Er_0Yt^r zzvT$Dhvxop_R0S=`D9}v>!T#WSYdcO>_t+E?}P$! zMjxcjJ-6^ZYgFbXbzhA#d$GFB_}nhG6PtP6S7cZ+^@gN3rwuyUs6cm@Y{BV(S&&q* z+MFiT+$c&ffh?5Bt`Ovh_nsYJeDc6KGsFBqasyZI%OPT1T=)!d-xqzezU4k8O$ zVL}nz%~Knjf)R%>t5{)51fJ|ebm2|kcv=Lx7dJb^TjWoe(8QKU{d6DU1$D>VgR=)L zkRoTUaF7pwKb2nK+h7MrTW8N$eF_nv*4x_;%<{U9u^fs+_&3A>A$+1cO-Ex{-lmZY z+T}%u-|V_$1MI$wi9-VUBGP4>2VJCS5W|4xj04-(j)bQpsy5z)V%*RSE?^fgnYbq= znstFsQ>`M~P!}~+TK#3-d8yCE@$SWfdxGM-bSu{v-7oh1JF;r?i-r1<%c@1^<#1g2 zUacLuT05QHJ~MZpA5cXluMjXXG;g=^>W(uBYplNE3kA{yBNTO^$I1DgEqvSU2~Ty! zAc?tJJhQmtHvt+J?ck?Fn~Txp9(VC>+RI(N7k(3O-J9MPiLzPb1rk!9mAO`*ZE#ei zlP{>{d?k99?E{JnSq3w0g_35JlZhFF<3pdWHu2B!ewuo!?Xm^2H5tVhdBV;aNSGi2 z6No7iBAK+l-tmCl;b52^qQV+d7aA#{3!M?gBnd=lms!QWh?1KDf;z$J;!Tl+8M|GV z&Md!y-Ybt@FHSFvIVWWb5MfZ3B(AALMTuyocQE2un0dWW%@gW3Jw>sjK+g$D%%8uA zklXNh_qd#9tj%HMVpNpOFgwp0N@&_iT2YMWKk{R!I$EyTf<0z9E@CmGD40BvZ)RAK z-p2YY-u#DggV){{bDDT-=(TNAtOX58nbr6$VwJqNfsDLYKRV`PPI#yM1r)D|7k~3% z+xmQoZugYQu_5c3UfexL3*l!zJCldV5FY#LYriaOc2?ZYZP|sr&-`IpYv?O{?J!m) z?c!4<+4!N5K*U%7K>KOu?2yBlE|f>R{e%(Wi+WY!^44-X9K+bwHs`Q^C#7rrMVK7I zVor5yvy+@j+V3>R%aX$}(2L2}9pZ* zcZ)h3&b<;zTk58>kAi6t&iaY$8M&HJmXFY91j~~}C+f=}C0{o6TZ|JD30ImwS~kdi*9V)085Db%rOPJ-bn10q|ng>`Qx zn@3AO%xS>L7%+tn&JyoHcf!}XcC>B07rwdFPZQzH)=aOt?HI{%Z!BxX)Q9N;W9dO z9ZB96y|}=qW}$K|i|%rMXulh^mB-?gv-mWT=v0TEfYw&1^qnCXd}8Phoc=g?K8%g` z;N*R$$!+iE`90p=GVO;ghRDW(9~2G{5XmKFK(2mg1-3@CPtK{_r&CyM?KfGlO7@;~ z5A-;jeQ!>R%WpNVG>Q^b4q}=Q+6ojz;Di1CuPgl=7 zI3cT9vzlae*u*Gg7>1$LS~NBZoj*;Te~Yq=^dQuSr|1X%LEmf~48*_eR4|AIk01Hc+a0zN{wn zzieE#q9p=Se5a_fqb7x>Fhi<^Km%;$w}D|F2qL=~8y$FB#-3^pC5&q1>5nu`AanD7 z?fHmo$uv%0z4+x)XTaIc+vE>fDs5Gngs$E3P8HU;SU z`}KiI_{gbh^brXCDZ+U3tf%+5 znGl12@~#^qEw9P*M-*KceUzGN0l;f^+B{%Ay_y!-9aI}~wbF|`*==s_{11HJ44ZPG zks-oLo6$(P`*bJfWKD0m*@#bX?^oVMIBh5)3)Knvq;2l5&X5B&_EhvHh4O!V 
zQ-PjpfnPXdUB$W;ez<^EYVX?>zbV)zP1#7jCh&ReB_#%B^oNWW%?bbB#iH!dSgqS+ zQwxj8Z^C?^E2ly@SeIGf1-{#g%8c}20JHi^=iB3Fl?%Hjf#M@pP@mL0i2df{Ph0AF z^;MF?YdwcWWiYmqk#ok}SY<#Qz1}|AjA5PxW?5=4Vs{Lv&p>BX`&okrRMzY*x8Q|i z-|tdhU`%OQ=I|`atJVS9B9!d?^Ta+Vf*NrjWpAIF9w_rz8c)P`RV}4)bWNz_h69@+ z_SARiZeG~OchzC8(-nLbSh%<7#5!&upY8CWf;Gy3NeEwVx6P?*8fFeR=#33Zlc+P&YKrVL6 z7E4D%UlK<(ZKLEd)bQ<9carABd8fmLkIi7@@dX zC_tUQpxmOMiw9Vv-?a$3PynTzoF>pd{^2yyeD1~@J$q4S`{w0%rLlM)Oxyi|2d}G# z?1;j_XsFe7YGpw)-WWAm76TAk=fFW%DB_Igj+UtuB-(3|LLz1D#0Qt7ejG5{8P^rC z+m>70SZOR;*5^HMXe^*WH1d)zBz}OYrsspR;qtmMa6?E$+USQb8R`H{*OY`7CAC;o z`u4P>5>qh)P~;{W$iexM1>}t)loUEd9!*HJrW}5 zi2MlNhs{Y9%_#019A=$F_Y4>#%`W)EmC1kd<7nF|B-nn3xc9+D-AqYQ-~PRg62BS1 z+5ADbI94n8jn9`(+VNM%fy@7UdN3eq!KbcnVG9D{xw$c!HgmTl@}IgtZYr@AksLlQ zhlxXM(8JdJyW9(WuY_DF(OHN255F2&{G8o5laX4MQOz3dp76d6>~hk37C(cs4Dva? zZ#U03x6Ih%W)z5+77Oa-i-<)mEh*APUcz0oB$)g&w`n?_=L8oTfcFA?LiVJT>LtTZ zgGOJ?$Jl5WDP>PPR!?r(mj;?cqma=o^8>`4kmhDFfyPRX0};sxa{FWUgSiDLE?J`9K?cLsp&Egmr#vPsfM*O9LYl@^W&1%N|( z1r{^Gi1#Xha~GdN@QP4R>m*AyyBovWWYF9a1w00b~=2h;AuEunK}!Wn&#xHMmQIz|A(svsyD>&q&AhmVLg z!9>R9IDt2oN%tY7tajR~R)z*4Ll;`B;VRYf0$i_`_4m0^r}$iS9b!NW0jlO}ocf3( zlt(Scs3pV?J%f!E0*~SH0>{n1; z+7H)SXTM!z=K1Q*U%HrDLKWWk`!C}GT9cq@ahBY`6vJ5poQ7f`b*$+yqAwwL(}tsq z>DG0f-CGVEw!Nj1KVfodA`w}*oDdZMs)n7?_?v1#N1?td!-MZ(A5pXEJjHQvryqNU z-isT{QC^}G`ZhfRFD_l73VO|Ps@*t2Kfio=Sz+gM6PQUg9+tP8_l^Nov?4kWCRh^| zjo)kqRfqrwhseg&Mrvr$V+}#byAq2~gVA7VyO*{v=5L*lu?~iz-mt=?_u|ZdqrmpW zlS%~*-fhwUg8oFdlQ@(=o5_&VTv7&Gydv?;?1^p06%+1}DLa@6#&_Mk^>J}qF+LcQl!$7y_G=a!J${I%cSJ&CCMDues5#jT-Ek%r5qYe^JU{=Z+5rzQ`f>8uhEr4 zq5DV@z$I)aWZETGThes?eE!jsd7WEjY5?*iS~pbY_S^*1^M#E0ffAjwT*nlrJ4h-F z*WYn3SC-fj3kAIK%HUpw@c}OG8A0Uj=D$FetwNfe#dxEBkX;hunc~Tc+kA)W&m)LP zLlo#74Uk+3om`WwnGp!@R$JlG@2gk^+3##Zq+o<$s zI5Wsi<8&vXI|#4oR~`2P_((6}>YI%q)1eHEh*K1wkH$1l)7YDa-_fkZ;+yK#S6~g- z{tFKEypM{iO16kIIAVA^p4GpkZMe5@8ik6HGlwVLIg81l@|E5g9o+^a$DSY&dMotC zVRCGA&&k)w49V9(t+JroM|2~hT(;-h#Tg&|?%^aOx~hXSX-iP>+g^brnI!dI%dEzx z!shE-)7&>UxTP07{N(u 
z$2HDZ49LzforRivl`!hvz5#&ju3mJ$a!&kr+;$0FU7E?IgIy$DFX=C-?e6gUh9eRi zJH`FwK`?@uIGV8`o(y9u{jHH1oA7nEKjGsCd$WJoiNVEVd65Q1Hu_(cO!Ahf3wUQs zZ07_S%m3?C-j^=scTm7ZFXVmt(WCrBA<<3M^S=^=pq#{6{o=Dem0lIK-tO*->%^48 zbe1r=_PLI@sS4MCH-8hR2^1P~b90Z?dNI-oc?_(7&l75w<_-VLFBs7LkH7$uGR&T9 z+`gu?ws%FNF8sk?rh+%-3-k9s5;|cMyNvp`6p?UyFi;)y&Ci5xVp@GAr5lCMj`QC+ zj49D4A%cW(RN8!F=g)4btfx4i-0$QS(jclDYeDDiL9Ka7-El-{5nD;k&`~-A(Z6Ym z0dwqBU9rggr)G>f;pel!sDOihCM}CZDGR6SwZp_4O{&$yTON+4zHoi-?ARW0x9OhD zWJ2U<@ypmxetUH2&|+DAXy1zOffL?qA4O!ZC=T=N5s1G=V=~??6X98Ke?7U(07dOx zA0|e>$0Mhm#u}2ABMeNyX=6r18!6J0y?4plx1wv?GamCNIHrTaJV&+s6+_2dtQo0z zE_Rpql8y^@*>7=ItyeT6TO51;d}1Jd%|qh$%;;_z&Jr?HZ(fLBjlR1G3%_&yna~QdemGs&z+qPuJdej8s0m=?Er_@p5p11 zOQ0{r)nUmT7hO=QT%Lm0rx=_g?`@}_H+VRBWGiuvo zLSjT83aJ4kPhuo+0uRpMK<$|--tMPENunUV#X?%dNfmr-a~zom%#s=qk9Ccxj4Ula z&{P(|2Soy2Y6Pt(j%w^oBj7S2X4Wy+yuCt{Wepk|B!>nRgd+7EX{(vo!36{Rl!Fq4b_$3L^S+|tz&?Qj1 zYP3{qH}`}9pA#@seRlL>$_g0%auY9LL67%BGdy!lx={F1xz(l^y0D&nt< z03WlCN*U?FZDEhniM0pREA_LhcUVoZuZ>d|*pq9-^$=H(K+sioYUg+PtElSQss!R2 zDFot3%rdnTO_U?}{82T~5-$2D4tlQ@Y8y7LCf2!_%CCQ|kO9=8<%I{lRNrzrzFVkO zCAsz;kXpQpR3!p~)-~P-mf9u5RIBHmxmX~Sh4njkyDVB>-Bljpo|?P?SJIf@7!Ofc z-Vw!`&uX6kzN1ylZ@znUnaOy-lsm9twIz8IIE{~#&@*Q}e^+gSod_Op{mltMVCF#i z^qP_ZA&r8C{q|dU!v`T-*ylYFzaV9mQa7;dZJPr$p9$FgGy%70;F*Mfl*>@7EPnPH zBn2K4^qW{8==C#3JTsLvJGM2cMx6QoCmErWtLf}d)|z&(!*UR)AuNe^%g}VFf5yD7 zWsE@C0uOdXZiD_q7Q5UE$#-dYAsTmq(_mEs>#ojNIL$ZqJe<#kp*efFjt4d#2@9R( zr4>fK(sqLul`kwUW-p@UT%GEMU^d$y8!4EGlipFNlw=C}b|-d`7f>Ru0>Fuz-$Lbj zRlg?Ev=Fu#9VVyujvs$E&0Mm4zO-Mp5ib?}9_!r4F>*N+#w)T5|;Y zl~&LwnbEIv6h1EyvfE~Jgi5mp;*|c>`6U4N0!gS=C%uXaR(>Qv(bq2-#wFC9niNlD z|LS5WX6=80^WS^wB%lGEKM@#m^?N5jTeb5JTmocH*S>+vi(>m+Ve-3T^MxoX#7VDc zF6n}fy@6%?!hmYVKRNo9D+fEXqZ;G6Y?wf_$O@;wPg3Q7p# zXmCqhtdy=hbuin-cpl&!a+$G>GGT`lO_C9!Uzt%>8|;{3gzU)%Qs6Xx41zs%kV6C4 z=a`qxsGX#nhlgLke73Q>La5N}tR-*lM6Yo*3q(|fQ}db#k!BA?pk#VQT}UnNRe>GY ze9dyVKPp+njtkzEsPIjQN+2G0C3YCxt{X5UCvNBcWcJn`JEbdyEH6w&` zs4hr6@pmLhqvcMOFXHPf_0B@zU0>Wx=k5*FlE9uG&(LA&8l&9N^ba8?q3tG1_~<(7 
zW~mP!QIm@gg^EU+?|@>h#a!F*c=TTtVfWr%^zEtZUDLpxvAG&y2YZ^(=ish_i*GQ$ z64;dUrepOH*ABdX*xygBeah?toIQ+>lE75o;3`7O~D{Saw1*u1n6$%MRghi&L6+Zhh3A zg_+ANzVW-NvXc9UnnYiX&kVNnpJ!={;sZCmf{8S{m1;(8hWd}e3>Y@p@qEdxK9DE~ zQa`*lQycnnq&M>sky2GevMUo&sPaVKwWWo?t;?H;I(iX113;-*puZPwjkH(lC%DDT zgabS)+-sKABSF*qPQY#(+P+>Rv@^Mo#w2n0Zf9<>ZBg-YIx#0dcfYI`zbozZs6DJv zY-94RdGTh!kpg5>Orz$4;hb`;F_lo-sY@dByax9!_bC7+!L_-6Sp|HoBBojchXJP*_l)`7XV${pP zdd6cq(^wj2PV-Jy)j*`@C~MfZdCpCO+_AyoWDE#b6?o0f(~r|Y6w;9CPb`-}Ng6{|XYjG8O>B{(R!{v5!(8;u`=bPzNtefO zdYLaZRy`p_rk&Ds(iPf{G>9hzhS~f8r~xouf$p1;5F^gNItBc^+oReQ@Ay-5%kkP! zWW!nU)xmu@;WO9hi+Sus_)*2{f&*Tol_7$xYz)(-dnzlIPvx~>1gfQMbg6c)yF7TG zZ`fbc_whBi^5Ww67Xh1zZ3G4No23CHzCO#^A2(d*ozKcYbWzt|eJu#-Ef@>>lm_Yt ze+7j4b#RyW2420#mbmV*h9?fVMP|f7B9Q zK_`xvaRHCMmC(lZ2|>ClWW)TkQJ2iX37y*%2=k(k6)&Fu`em9})W?YR=Hf3DQtZS1 zxHxj;#?{~OhNS(ZNUtPnZ8L#aO)jsZYs<*aJE`UCFZxK!k*p2t>mH$)29G~sUl+qi zl| zf3ITu6lmEwTD)%^(;#TeBRZM-)T6l|Vsv-wDVvlAZYzw`N$pTq*t>BY2vRQbybCaT zR>nl?iSl|?`yWVU{2R4jwa+=CPQ^}7a8P6U!-h?l69e0B_5TG(y{w??I7Ow(08TTQ zvRX8u$b$M{hhe0oI@JAq5ZmolAmjn{64-LONEhmfVf#csFy>B(-vY4wYVU*JAM zJv)E$;Vw2H-STLD3*!{y0651^U3HxlhWk-QpyvHdBp}RutqIBgqBd{Fr~!Mn1EzC7 z#fB*nh*?UH=FEzC zHKHl@#A#^r@nl$ds<%@+vTPYTvgUZSp}(+f9K%~~PNF3g zg8tz8M`Q$KfS$LY=Cey!(3D>;j@L7v47?UjmyrmR?qo6CM~}^be*70}fD33hqEQNLJ# z#`z_PO`NAKc2Sf0(l3 zI%sUs$NxP;EdMN@KZAeylm9;yLN(4X2!Vqw>`hZdfE=|s4rCB-?$R-&80u?nc+s7* zj*FO%neGO8$6C74Fsdy+f4aIW?ac_pRJ>|+A6woM67Ol+x_1^LhG{p^@HOS#!1;7s z9g{3rf~scPt4cEBC2SzN`vgJ67;Z#1oaOXgUH*-_DN)dIXy|0grU^yA;qXtVry@nQ;ozn(dVuK(>+6Pn|K}E@n>4#zZg566dCQn*C>$AUn zl>B~!fxWb2-SGu?4J2+=NZej{ttZ76P?~*Bh4wKEtcY*)r>(k*{6pXIv>^4;aHF*O zhe2NzV%^z@^yofd&mzU0>vs!?G24>L7tqu_!qEMm zW+BkuW&MBA_SRuhwQJw-0t8fK5S0>!7(hfRL25{)TM_A$QfU~vZ-^Puab3n5U*{QVLjcX3wn}f& zNzRBE%N1)ud8DG*(}RR{K-W0TXmW2kK1{03NA(VB0!s4x7+d?@{i0|+u^gJzVmi=6 z8B>Kh=dgZwaqgU+l(Dy5Hfklfh-9vAs1J7syn058O`akA#8H^r&?tW+s{N)hI^|Hp z5pkn^2MGx+pZUZmk%bLAO$`r9W4UE>CfOVe;jOc4wlxN>)KRMh(#mt3n_hvIjX*prj9 
zsM$5=j=j{9;3=0Piqh)Jo15RvehUk}Nhv)bFs-PI*N@QvL}hi)mwQ{16TVZ}J&;A; zuZTBlnK%?PDHnX+s7nidqmd;zdarlZ0cHOn@F!}(OQB^xj6JX(SJ$74(~}KF+^`&R zJUBa%j<%6w1AL`vwga9r_i#x$l3^)UzFGSa`CEK=v${?9Zg+i8F{3jL{R;5<0axrE zoFWU=xQ=?oJliKuMRGo`*!*^hvsaqB?TO*SabF7R#+{g00)}EcHsezn!y&ZLJkJM9 zr6heCF9u5Y#Q^l9E{5GV9S|E`xgv4>~Ka8`?sBT zJz|B=Se>zf?!i#u8EVkyzHMJXuIHA`WeC zp5!FE<697s-Dj}P7RPL&3*hC~WaIYHSq3m@!tcD@tEMnUUTmf?qH2CyNIx9O zZA{5qpaoTTB+nJQ7k>C^*i30p$iD3>;;I&!ZpK=iENFewx%=8aKTxyC!5!D^F!8a1 zpcyL~67t|@wwIYv-qn{u?6^8*wtM%{pCI4UAaY{mYW&YiH+K@XUc&32Z%P?{(hHb= zfix59*FI%~=aW9k(Wx5=$$><{DJ-iPzgX6F43en9&-)E0wqM@_%xa`MSkuJ#@(ca5 zo@9$Lpu~jrD+k7gqwf}o={4C?PN0tZ4W^=p6kH|&Us)c7ZSkFqt}oyb822Yns}tlSdS;#_a5!`b!4qf{A)4}!!8S; zn;T{1?gH_0*kuDgx3phX96b$B(4KDZTP7kLDy%Z`vf3rED_L?Ikm+wEUPQ0S6lMpU z?CtbQ?56?!nd8l)Z~arZ)@%WNxF*sy z#@Rss^5bfgKcw#w3})sv_wp5U3Ot`=^fT#;+sk)rbtrV2<8D=TR4{IS)zU@G$ro?y z^MC_49#zMFe%P-)SpKaDQ6GT!(Py1dhK>(~*hp#+4XjL`W1y zUZ$LnMmC4Ko}BCrX*S@P2arKcz$->a^Yi#f@AkA&z!pEvex&c~&l#2T=PMis1ae45 z*nyUJ=trIuxVE7xJ+>MFf!S+(xg@r_af_{PYDC>!uWu7io)F2t%#R4LsyE5}#K}Ur=5=t3>UYKAphiBhs$WH#~ z^6hc)^`jy^Aqs6@Xs%fOZT{jb**OUActuEAS1{Y9O^k*U%a1_%(Ufp=PcdsrUeEA%i5xF21zYJmtbG?EB$~ zQMiuE7_sIQW&X|2^>q&qgb>SH-HA;x^&a`qJ&Ee7ytXPogp;}8lk}%zia7rByAlV+ zQ#83>-t38h!+99kCnWfGt9)glNPMJs!M09kXHkqP{1FpnTPGCsqu0DT07Ju->*2OP zSum$bhqp&N7oGPCfw$)Y4DoE(qIJWrXOSQdGJiri-Wo5ng*qrfI28@7X%GNw^;0&V zN7iKAHKm5AYVcbgEj}m&!xU6+B=}Sn`kgnp--we(r`y;|Gy6Il^hq8cv!5VaKiez&_mIR+h`h@t&=b`n7$XpP(w%!xWzKx{1Qq zBo0*q^o?%I49>Mfo&rJg*v0Yr9`*}+GNmMdVkoZT1(9nTJ9a5^``joSuyNNsY0TOt z*B58QE68ZdqN%B?BhnPmhH)Jp(?G(W;CBqpKMfV9#t#+UH5rP_Z=Ek3kvQtxRGTw0 zkW%>o`l#L}Cb{!J6aMtc`sH$O@iM|`Wd0@;()vgxbG^Z+))xtdc&ZG_5Ti_U0b{lX zW5=4CYO%tpLWSS}@x!qMcSFs*erD2}0`v1v2%B;G_pXCkhyh6+URB!>BH-J+2f!yC zg~d~9+_I_|j1_6*F5q>(Is#{*$jGZN_nqVTDiB7b@BYB*8%wY*mY2;gBaB|2RtfN$ z#?Xn2S1%*f`^lpWI}$hWfO$32g_g0#P(CVT!Dx7nIfH3zl0gZ*FJzdf!?dCQel4@> zy33|gWaKozk7ZS?L{6J^-8!R9271aI1UfbDmUd4lcfIE_&+xQk9tm9Ii~XU~>U{tJ z0(tU-6o=pt?*kkPmRQtbMV;S5DO?{J6T})ViB@Ky!S*b-2@FZQ+*xz%1%ukIo5rBb 
zzp@K>`&}pd2pO+WlthkJ2{ANrEY*#yz1vzd7V_%9B;~YQv_C$ipCS!k`{s4j{Ks>>#XJ<*J^Ba

C%fN^50!HT^ zOt9Z|c1JZL zxDLfrMkc@3Kh56qupU)Nhg`twW;((Lu%H16^)7}-UeYd~N>f{cAo$*#VX=1)AElwS z1SuIQ56|*-Z~O{CSeiCm;2=h3=|bw0AtLkwFc=m6RyWx!Y~VM3Fj$pew#^X{Ee(P3L9>160bvsUH za_~I0A>MYI*xonod=~A?{hk-&SF?faeCzzdt$`NK!eh9QBXwd$m8|n+@aISadftTs z+gf_QFawQc*w6*zP3;IMZH6br%kl@$&0B(Vcm+onWiomkn^qSHFlm+8e{I*y(6D2A zVt*W3|NeW+o_xlSusVVmQ&%Qv5;l-NWOD~aJf@+al*ZX`#shsx`RRjS>r$30ero z2(k>bK&PGxbO)}7usv?aUJDHetbgH!f(pw8ZNe3tO_V*O$=mN8-ml$R5j@!BRq1n- zEcpTb2Kp%qwtDLyc{c0?Lz_~(tEOa-X=?@KinN?x-{fg6Y%RP8uNBpZx2(K;vZNmq zaI~e#D>*a9pZ5@QeCs4=29nv@p78}hgL4X8k20I9+^o*B==sVVx4x76EM&n7m=qKE z=wOkO$mtm9D6A)~qdiPtXY{1Gqp-ARP99nBe13XCvCBnU=j-e`G%>RNvdDyMkIeSy%;%m`xQL<(uI1%X1X6H4)H1OJXZ>AG7kYF9kf7ji!V9@xw)W zO&*EIsn^zynm$ppB$s3cNr<<9g6K&uZ)dDpV+VG^)g%BHAJFF!Q-4nM ztLI=ta-L!4dZ*r&zEEnw4ei)H1tys~jSO;% zS#xeo+#hFt#ItYu4O&zJG8!m`7Jz0%Ipm0A?<+7)x=0HaM8W7_sT%)p;2jN_c6>sv z$WImA_Qf``Ts^1DN}NYhKJlJJGnjiZmt&`b8#$?<6Hki$5afyIZxR4maPy(eNBZPgm;dsM)qBaeWw2gn9g&cmWzx^vhuRh5sF|Q-{CSxg4F`9=yr0V-+FsJ;=Cedz8u)e?CnIX2J`>J{ z^v=2p?d>>bqiF@6BVz7(geTeedaSLO@ft2$Lota2h}+YA@>jjH*DootOny)u^36P_ zAe;)Q*P16ZhY2K@n1E(Lq=B(9W}k>lyRe#y`RX*H=;l&GQqXzbz-(s%0?{nx4L=wYlqb}Ufe$jFQVk@C99!kz=cdL|GNu%hS{rk-lup)ZIVSO~c&k6u@V zu&LymHWHqfy@ag21ntS_mlKCvL}sP_>(}5(ujgCX9?QM2ub1fgV^`5kj}CSn_|+?8iI?~O6HntD8|$TSdKs2p zb^=?i<9#i+a=;2dz=QIhvCN!Ne~~h8Gh?A&RhmDAGOR|siSRd|$c2zdyaW*e$wF1@ zw2@v0h%tO$A~>OOH7Ze5Bo5IXuPLM9EbIl+iQ^GMCImU5di^89$>ybjvI5ULTzy_+ zML3dWG$J7vlQnx};lm*nhK<*c97s;46p$#o1$N5Oh&#FHFv;=ypm3U}g(r;N$0a*0 zq5Ze`8;(yiUnyz%oHm~CzeDOyw!0n@a!pu93v9YA8!0as;l$R=VFR>|xk>Y*15xhE ziV2Jw(*xA7_)$TtDH0$^4`nnU0lYnOv1%#>y-KW^24_xOD?+8hQIIWr7a=n&{lsAb z$URq~9Hq6kIQk@ z_N8#6$3Oxt!R9H=VW{3UqgN4NqYYq;X+*GM_NNLV{umY|rheSRGd}up4^S;`vc4li z#b5}D%EP>)^%sQDde3*|b8CcPbV>p@(hIz$V93CbfU7FewXy@Pr;bb!JCMi_vx&}wL=wTqEx%*HSvI~I!Ucsq zz;T+V8wS29NOc2k7{mKs_~}c;fFNU5khx|4d@&6WfVlvyLs@XljXL}W6RB_>VNM-@ zlfe>ubxv6|=XZpy1|J+5Xqr#urvfyn8nA)6c{y>(7||pEXbKne^W#Wp3#H~R1}WWE zGV^Y%JDASEA@*jcatJ=C#BApusr@hzU*vSGqy$3sG$QVronf!N2%xu*(c}EdPELRT~ep)=K(Y@tF#PR_`BuH9wKT~X4CDHFk 
zLL&s&ASA5P8*P9kRjKB_d88yRIqvod$JkP_28P}!^m`1|R0|J!I=Zu)DmE^nBXjD- zX=|oF*$Cgg3y4Da!+){6kqZV;eflggIF1n#rL5tx-f2zJxC~?91*tf$*0DwZ*Xe`9 zX90J?LtPOY&7t&xSlsui#E{7^6=Z{Mz+2oFiak3ox!Ow_%GiDJ6)vC3CArAyo%Whx z&HJKDwSU)mFe2`vDSz^rKW}I%{1Hw@B~$g! z;?yN$H^L}K^`c$A8#Bi*GVL!;yxzOO_CA^O%q461>i_mhvCtoV;8~BA0!h_dF)ZGX zy8>-GJc%W;xLaHl9mK_*`r>N*NeonKIR(?T=ysR@|MbCkBC@0stJZPnKkkitvO;~0 z+3re1GLl6?ev~1o-FeP}TF*IftD+rCiGXbi${`j%2Vm1w`QOko7rE%ds#Is?vO(mQO&4=^bw`Nbb=}DzFF3!pfJKSch#Z~zaU6e+0-Q# ze0@7qYD(#svO!E7jB;EG+%`q1L;a=|&-|}Lqa;_YVCidbehycJDnPZl8bIk%A%PKotNtAB@(R8?Yyqd1qbhW+q){2`pWZMs*# zR$C+w-FKjRi|ICnL5cAbPj)qW*jH@vv%|En`E`bhl2zHeb zBvD>tH~+B!nbW9FC1W?DExxsAI+;!cr_?N_%Csw}OEDaDWocMf+3GxrB@HE<3PT@x zDS*iei-~eed;Lj+Y{*ObF(sWMA)y}5b(x`iSV)D|Ez`eM4-1tKl~}v|2nm;zm4=TJ z=iPcR+z2S90?w_1p$dEOIvi-nx*TvH>cesn+k<>hL1 zw3JxczF9*PG{29%YF!`|CpRsD9E7Q+s;;-H;LAbl#t&+Y#p=cfp}(~cw*4YyK)pjq z(~<)E)j*?pfINaQtc{L_WBlF5T8IBoO40N>Ww3Mjl$kA!7lcU7ZFdjedDdSmQ$*QE ze2T-n!ZdjS*6nTQa6kPw&lOJK-%DdnbwH)08URBZ4;d>5szk+1rK+0NXT# zAKf9HKl+3qTTGkO`P???i)}V!yd^%D^aV@qgyk?Puv}WFz3Eh&l@opT2w#RLB;Mb* z3(SBoH5rKAyzK;Lz|*^O!47_}7smvL#lsF(_<=E;SM`jlxkU|GvGen%s=<=(CapIi z5n3npGR?#>O;cZ7#7XW}b1uazhB=hUKH>a2`aX;s_lTJOs~@q8xO9z$GlN+$0;`$L z_GA(s_HhWiFJY&QA@BS(RVfd`bh7Q_fOJe;K0|#h-T>^uD=)QXO~l21tC)}=N31-+ zukHSf(+?HdY>1fzPc|s%6oR726mEVANaM5GkWKBTQkwqp5x>k}?GPl-yIjc>q+a%o zIXVygunwDMs=QWsZTGP$jX~q}vOvog&3fWFF?8@<(e})e_Fi41-OSOj(3v4J++^(q z5zoGLbux9qOSv@;7CVX`y7&JZ2I@ChC5wW{iLX{+(-Bxax3B)5FL|wylyoQ;5+yiE zQTX-KX0S|j$HhsQz_G0Fo-UbqHkL;jQP7+CLH9C}(mnbuF1fjuEYcu@j$$#+=r2Bs zFe}S?#|mbbN8ct!?DP#TQ=dV9YLxZSRr-%?={31R&~s-euhtrwPvf z45CtFdA#qPas-|qVyD((XJ~TKA*a0$srfOjbH+lfLunquvdCu+AC@_o&mRb_eZCkP z7`6lzXLy(Z*&;hUON>FbMZpCP3T*~(`OT6;6J1rZhoT^Qboe$WeKMTo_L{G9`Z<#3 zAf->9h@73%%>Ow`3)@7+4lSdnA$M=cMGAbK@R3`PjP}XQfJ4l{K_mN_ZQ1~Kp7 z`~0vyY>5qXdQ*rfAeF_rvWVKeE-UNVe+IEgycGTb${?#UFYZZTCCK-OgInK1akJ~* z%lQRMtb@;0egP-}<-UJGCZa3d`}c&%+LcQ|#JW%zgl^uA4lkY^@U;x&;*Ckt=%=4m-wQPxhf 
zIZhl9xwoo@b&fCMZ2pZPs2d4@@4w*!o|j}1Z--OwTH<#Pb{Ut3WSnh@T(97@QwVH)OrP6#^W}HZ{O2Xx@0D>aZS*!?K}}9ZR2{sewUB;wdsb+&(j= zvHZUj{yt~EmeF;UUpF1b=Q@z_m_e&^E4A=CKqY)Y^p3OOdo>IPrDm3?o?<8wy?u54 zE@7;?hBMwn`VH6fB8Sj-qo7{U5h#*W_UqpxEZK4=CcgTvGZu)UVLwu;J6ATd4;{KZI?6Sqi%TYEr~e9;1KqL6Z}#BT9s z$;t$r+oM#9hT8QEytxooXw=iO=e&#E4miN><3QUt7JB+nS(|joo}A>@@b9ja<4ZS1 zP^qd>q&{qq&D2q|4yF+;O8Q08X?b%-^j@q+VTm}SeRU>aqdy>w+7CYCrH>B!W$vP?jO`*&q zGxzT;KlqC;&MDX&sR*O9+Qz`8bMo_rR7tnw`d5oTZpm6KW`sKItrUMMXB!ZP0F_=F zusAe^dSHOAiuE1zLEk$NzbWT%4`i|n{YkNBX#b+v#lWtF+6;Heo+B?Zx@;?jRFHFT zIAjb_Nu;lY-%8^DRqXPmt#3I~NO@2Gq`CnV4~fZX&(^VpxlTPpy6>xVAELAj2;?Kh zpC)TLj(mnc{TJRm2lu}euE>e|wSRkYfY>_h6{*xVBR|m1!3jCl+qli#23BR9E_JD? z1YVVv zc7~Gy_v7QJv74(N4Bwv!hIZBiz(KyIm*K-ukNKm(b)wa6Z|Me^`+*GDJj6@af%V%I zIMgrYscT=nx82Wu^)tdEUxsjeJAy$ZWGXTogPw&yecWEv!>itCOc*s_$gKvITq(W< zgo2)#w1!^==Dfr8R=Eo>8z-tah?=r}|0}o*06$dE`jvms6 zj_yC)fad+paj)d31lf0e;DWe7Yg(pq|35a=o4~`s!9^ymZ)#iI&J?6UbQD9iHgPzA z;}A>wXZT0;lvlJDfG`O~dlf?c!5!)Q9f1B_usyLh+3#9WaQ|1WxEjG{9Hc>WJ^wLF zKJM6>6RoEhj+MmLqe?i(rEVeKu4@3mtytubW`Lm8X;4pg{zrU;$4G z1EckRTyXKr-f~{hXP_^T8RLvaJ~Qx<4=+t@;+{-7LY?&si~nzTOV7gpbhlXegLAdb z?5yZs;#7?cJ5CYC9e@jJ@5r>UHPcjgu+h;pOBXRuaKgo}+;~@Tze~=7lL*8TmE}i&m`?`G z{E>e9fAyYJjUVs>2;=IB&Fb1L<|@_@qb?NNXG2{4cB~mXehTn%ZITC;sS~^zEUsKN zZ}#Z&JeB+7OcBeE^XN6~#)t;q-BNsrFDik=XZFsYI#`7Co}l0vqjYwZx_J zsh}&QH@?u#tV%NoG##{bZXtQ~ayzSfO|AjDpwTJ+mI`$jUZ?mT_n8c;>I_kHA*w_NS| zzJslDhLG>s{p)LXrUo;YbMD#U^m%{e_x*pcfLJ)za6Oo0iypJatZ}pj--bT;IGt|# zuE$CUZ{r_EkBWhRXY@Fr{7g<$J7L4CtmkL0wZPuYR;E@25;2sQ%_??>S+|r#@`Iv)F!-WWuxJ{NkQ5<`?DqF$rJd zWt8B!8d2rsz`*DKh;supgwTBbl1Mh4qH1~CO-VDR`tqLf%Ne77fgfYCo0q+n;2#it zjU!_ZP(jMwnU3{T94h<#&@O>q>$}z!v?d;QD4t9iHJm9X+wG?%T_*8KHp=>rdHbj~JjG&n`DV1UO#)ILgker}wBE zUkH$*S^Pz|?kd(Dg6yk@F7kvi=yJC=k&#RQ7B3KcExp^Jlk$hp_pldGU%a12cV6gf zKCy~sIC(Y~cYHxjMX7X)xhr%)6jtN5%=*hSMDA4baw+_j42fazr-FdEBkJ zL9xh`70BOnX+=9?HLG*WEL98xy~_tC6htl!10!3EGDSyrKx0lBLwtxfzL!JM(cv>* 
z=SQjh3G(l$+8JoJK2iM0eIAx{wh;wE|bezzhdXXX%-lKJ=NpuP&E$vr0V7h#@%Cl!-0N`4Pc6=K1^LguuNSM(6i@FIR6-E zmfYps)N-8y$g6aRqk&>nxNcb1ubVyw{z^WC_*K;Kx*13`Q2T}JXN(vFYXoVYO^u7BGE>16T1_N$$TDU1v=)(_>x3^%H{(xvw5d_<4?i zX9-|TLqtXoSqr+TVS^sgb43WqxUo4Ndr`fuf+PIBg3S5xX-oMNzE?ZRSbn;hiYphKh%nEy2DI3U~s6u?CkoKbQdRHOOvAU=GzyI zetZ$co&#F2a9&wm-KlD#-)n z=qy@OWxxfbQ{~-7&9W=(_3~13*f+)G1)_??#qomUnk0OfDzX$?Tu+m3ZeCc|Oy>QO zX5vW22+RC`u95dH=&K~Yt85bfC}4DyYF)lH(l1FJ_#-8#oUPI>@HtWjaUWV4_sO)Y zd_^uq6_t;F$g7pp8*MN?9r_(cR*v;vv%JXgX^wka$d|!C+>vcAH z*KlP!{Szd>PkOJswbQp8tNHT2ndLkqkWS6GUXz5Gy{;6^reQbs<7yL|ZBZgXvrOK}c3B&5*>n4@LDtn>M{zCipJ~7@`_(gsU^t82}`g z7#NU}T(pLWcw9|3d0x$hcy1ogp3!)ouZdipw9+(PdHNNIp?;Q_3W$rlbf1K{d|Sa$ zUSs%e&hHK|#|;bd%b)UWILs^|Db|=Dqr4h7gVh1mA5CFXj>|oE@V+|De%_Jn zaUT8S>PSxy?ASH$O^}b@E639JUye)oXW$Mm7w0j^`cHgOWh0S zV2g;O(w9Ko?^!Ocz5?4u@jHj%(JI`;nt5q|r3kXqqyXR8booU8toCN_!%c6$CDcLA z0LqHA6RHUpWGo?x-!{wSv)FB9=LJuy$U-AFLH_xh>&Z&`;PyD zX0BwQQLdu_qNUc3h)*d#3(TM6-g)nXH1%&D9o*}#+amlyT#~LC20>$(oZ`O>KO9KZ z`Eb$Ijppe_{Ou6ED*~@3H)zEjfow4mVEv9WE)dPCq=9_EiMvkBZ`H{BIQ{M~2(&0b z%lXGWe^tffPTLX=niy21o&Sl5rjM<;BYhGXMxA$}w8~-oh>`Pq zZ%)Kqvw~*Mk>V6quc`m(>GIG16n+b7eG|6O=I)x44yF5L)SFNEhiW!3ch_-3TbKwc zo-@fKuWrt;>t%Ikp_u?LA6Pu_S!=$){w=)*5m7)n7SlH~~OHW;TdHO69VYAUg-v}_&UHsm+`vxYaY*FZ5yoq0nNbay7Y{!+11An29GV?BgjXBe+Lte@9m_a6HX8@WWBScx1qXx{48H$BU5JHR;$CY zCc^j?vC;iuG+M|C$GC#`!k_D9#nV$fJ^6w9_oT!*t{#TEP*tfImh~vG<5q7aKLU$QPPy#Dscrv zgZ(!?YqrRe2VRg{E2MK~f|NtLGc2v0Q{7`IIt>QHX9K)gZG2(<+beGH_3+x>s!vX3 zCkS;m<$U=0buelm_lFJzo0=n)MAU@gDB79ANnsPGcXl>F3Fd%x*{tJm;{=d&QJ9VD zqdQ%ylVP{MtLwCv{DW@D8NFE+c)bO~E|hux`li=;d?rVN3BoVUo(J@89_daoqk6_edXx_}1aLrwhj#VXrSAY4$)dE8U<8 zY2^Q+17x8?C|R;FvUYD_7R`Qb-c($wQHnqXEak?WGOAnjX!{i^#|sh?7$dU#;sp{I z(_*kF+z>{MIU|l`Tk`RR?GTRde#F0)ahKLfmr%^e26vFH4ur;ppo%aFstErD8oMYB zmOW0X@;n{V)L8k&LO*>59XT|SP*#=)7vR^()2Zo;t#^JKUZ@#f2lH##V8zD3jiFjJ_Z!l& znOqt2={qu9livyrY%1{)>W^))0f@7NTeW3X{OhX&Tm0kBsVe|b^w&<3LLP~)%Kp+u z7yNi}awVuip8tJH(CRkYec8bnVXRa;QON4T^EJnakLF0e%|cbGL$ys&&u!KE=c~Z) 
z#l|D?fjkeKW2=N!T>sU86J2*y5>fM^SsxxG%zmj{LJW1qIUx=6J^iJP$LTh@NJ3iEF1 z_-;&Am7HZiF21E@fBfV^OznI6cm~g>uV4ry#Fb&5p6SG?K0oWD_h6t+);~>O8A445 z56j}8AgY7C2NemvBwrmKgj}6YPkUZuZH-@@&Vl$aWOwUoAN+Jl2%-Mc^xSbkliDdm zQ#`$Cxu(y+HiWx1-kAdERVPb$=P^qFhKV{eqqJbcGZtljP7>*!1@_~5K7M@D`of@; z*N~OG$_xwYlwF1(??|_+z5oG`+@7wo>DPdBSTfGY zk7<^l=5hxCk-{Db%QQmkwwr+HW}qPPPEsZ3@G(P*gm{aV^rO9gO32u?Rn zny$8qJWo3G@~=LeABjwL^ziGDc#COyT-Muf9eB^AWzpaLP6lb161j7s^jt88@S^v; zy+5?Y-eDy&`ONeZ6!RiGk4)O8?k8<@vjv>+Dht1@&*xF`coK$9B&_9^vYY&7ugdeR znYX_8v8fy|qlFT`7zgcan}0gq&W;H-=q3nGw(zffym};5RjP!Ni7|Dlp1Kk*rxNQJ z*5>AGRF!kZqmtX$3rm`%Kh zHXiMEQGU}#zkn`AKRxXb%R5Pm1c3Hr_fn@O{MPSx#bmkW_kYSYDGFH-ddRSexR0f% zRqZD6bXwQwu&LD_5V1nLe+OI|4Ij%aTI^x{PPlxI7yS5(1$hiE^)dR!nYmndMl(q` zROZm*HN~3Yzlu@WJ!^>&tLkaFFx{;s!0nZ(3=d5W1bBGpa+tKqphH>WQR=-P!8*kL z{Jt78g0n3sPzi|(YsD_&4b1QB&~j!61&!#(AY-G}0)`RyVt5H=ehE^|l8P=eNq_uu ztGONmsDpen=c;IP+7l!A>8o0)0=$1RAhu5EGt~-S;CsO^7Yu74J0w76e9I(1mr zI~#;vQICM{^W=g?FKXtxiOts}pgnDp#uLu+JFxP%PW1^gy8AO^$-EfVS+{XiotwW|*AYpqb^&INtPL5F7r?)`!9Y(LzxqW>6fwXj z6D%>vHN0!HYX{c+KknHZGlhc7{cX*^h=8Ce@obSqZQc%f^l@*eQptfvSO^2oL;B>4 zslQ1GLZGdPrQN^6-P``*Hx#^+o#IB%L3gx;Y7~zT+pjcAah(nY<4y+pw@1uNExm8r zrj)oC<#TKcKi9NyLFBS!+uoPLBs0BoUHtck5!H#>_(0pvwqIWg4d9>=#IAKm zXB}?7=WritGJP0&{*w=E)ciklCs728g#SOn?RSdlUVzG5>wxtrrV-vR|3eW~q4DeC z+Js-7jp-!zq+4w)Y+kl;kEw9=k-!gxU94 ziMxOQ)_hA|9_q_`Yq>atI`352_Hk%)Bo>tIOlS3J*q~kj0>8fB9 zJtpd}mB4`UpaJ#PU!c|l6vhs)_khY4u+QF9w($?c`N`y719dcG9=bCSu>OOJcT6CW z00>f0Ai|o_+I3oXSl3^~1+q;Cv8lwo_v!zRqbn4u{a52wY1f~|tyZ{sQ6@#FrJ=(6 zwnArn@|WyJn&oMg0p@Q*jz1;HQ$vPRG~8<@`wiZrm%r??niZc5fS*Xk{;ixn(D}F4 zEQZjv$X{7=qxe6w=GXug8AV^9qQ4&WSFF6I@;!!dD;4eMGMguUh-*alMLR-?la~S?TAL-g=aeXpL@QW}Y<5qh6F#+H4NnP*K%3JcSWP-G|TraG`s4he#ZYXt{QcSa3_S z^wNQMNAFBW<%omn*yV$Pv#y$u&Y;d9>p@&@R_^yl8S8J$Y?JRo%^fCu4Ym$9+Bv%l zs|V=9Fnx&h{V^d_OY9(C2k{A*P1_7Dm#*~iWBayd-Y@qM;F!u15u`OuWfN@Zf6&my ztzKDi6l|V{A4-!U_NkNxL-kl)3i?JeXK71~oCUdBtfTSn4c3>;{-<4HK<@(2Ev}@) zrw2RWy6briL^vlK!eQK|wE*qygxe76$PrXSv0E#~78vj54V}=Scy$=~n!)T)cqqAa 
zl6Ah@M`W#0;rJt?g>V6SzZ4$E9qtmjc;!TL7z#F+s-GXYih$5(Dw}Vx7PFB7i(m*_ ztS2bNeynC=`L=~DbH=V0>K|&}qXQpx(c%>cO35K1{k(CGM2|Oar3I&9A{TOd3lXGQ zfT$)PGZ6X;!EWR)agsM`@r3OfkiL~u)iGk5^Rww_8j@e$YeEG~2@qItCU%n90rNys zV6-FhY@-0amW3|ot%S%$yzed1HBC{-hV1RX7oTx3Feq5o9~g)&vAzc6MZ90qqPA|fGG3k1 zv3%ID7Q_ext(c9bVN^LXOfmglw&h`H!Nxmo76JhxeXYc@I@^upXShdS?|pi7a1H41 z;EYrrDklT5Wk>r=G7#k_iAv1I3Jw?=F94X-@U5XZ_5Df~rre-@XO4`+wyyr<$=(tI z-o$s0DW7`n;zxFQvCJvbQq26J8&_E3w8q_Q&qi80g*Pn>%Ifi--R2Q$3OCOVJ)*t- zHFK$LbM{bb_x`nKxBnB*4}{%0hP+LOq#3Q8mET5R=?EM;iT!wW`vV^gh0${jWUzHj z5E^9NoVa~(@7gm&HOzTyio_p+`|LLK=lzrLq_0sGZf~C=J^(wD4&O^KGu_Qj^`lbLPh+`-W;SPXj8{8#z?KzC8+4yS?Ts zMX~^}P*hon$lBdZEq?)(94t%6RJ`FsF$$k233BwvzMEe-TP5kK)$E@6G7lkF2S-@o0hxUBA#4e|{ zkyW1J3Knc{)Z*lW1W%aT6B-@aBZ;#=s&Jc$=I;PA0RLT&Swn$weBkb(jo1xse31`#a!A_RO3e*m3FH433JH(Knt46#vcdt&k8K(u1RidaEqsSJb;DK8=}@9G~dZN;C;@6#b!$$49Nq4z4?@Ew_4UKO1tA+4gIJj z<{VZ5nhf+d0TfD2cZ3DlzH*ja>m>kMHv&z;R(-oaYIdlJ9AT@ySx{`M!(z zSpFa8-a4wPeqH~bfPxAvKqM3v-7QEhQo51uQV>D98!Q$G(w))`i;(W6lG5EsH`2{J z@p;bPXUA{vbIy3r`+Mi`KgVG9T-SYlKlgR(3P3CMk^>;Gx2sgqF1@XQXCAfD2P_u^ zK3BoR9^$jc?T_?vpb%lLb(c8E;L3WGDtDwS3dK!EY=<>t^jUw%i|*4XRiep3N%XfC z2}h3-Ke7zwg1rz&qv0OuE+K_KdcbX)k_((p%#n)(bN>y>=$TSSOd8-9tQj}LR~OUv zgI92$$;pQhNS91q!?EKXQB(mGK**FEaEc_S?y{gi=Ia~})IOYAInWe34Tm}Lmwo;6 zWypjR9}haBF!xO^0=ro>2-@#O?sA334*=D90==(`k%L*R>y{MiaD2JJXQ<6t>b3<} zUUE$i0*7zgMaPCol2ne48beE3I6DZ1vfZ1_?gK>ectdHz6JkdaOXX-r@?;T(o}TRa z!HGS;m9Ww3Dk3${NgqtFslmhKhz&FvB-o}jr~%bTdrN@U@43jRa!<9Rdd`|15V(icZEVT4yFJLQW=Inzi5!7MPbXj--a@ms~YQPt`7UCmuu;e)qmH%5GhJ8^!Eqm4{)t2K4@eDHx)=?px~mj$mu2v0A=A)dDC zx;fAspt>-y`W|#JN!D|mCi7{1Pac4$k7-2fFc5BV$7Y)q0J>A%?znZl==s1EPGnfV z;wqhOCJGPGcogDMrA+RD%8Xm(x<8<8wxbHom>T4v_F=Zvul z8DI8x@MQ0ZZ^uN-+)4dLe#p7m=6Y=K*-+qw5%$$*!28r@E3+I+AU{Px=nsuWEXu=Q zaj-cv|ENe@u%dyhv4A67kMgcYiGhWunY z=I?KL*=8MgWHYwkK;` zUqOFbK|6Y`NAYmpndy1%v{!lnrz)_o4Cmq9rEi^wC|^c-T={(X-Fy%oO-G1p#e_Y` zcMCAEK|c9up{z!(jM`i&x5p7SyA6VF*_Z%o%iZ}ol|bk?W|wOc3kG7#3#}tL`mtG) 
zwq)7MlFP`;@ohqfm6lZJS-OkX2p;gO=Ai@aNRVyF`{i4I&r`+#6)@bhaC-b^hA?9A zn{|Bv_zk=Agm#DtRL2Y0N1J9<4UB8WO(uJ@yC(l$Wf+^#Gd5rYx(uXQ#1;*qV+9`l}sfSv8$6<_r6nPM^` zR1k3l!@_)x{-x^Oe@l-in-OmpCs_<~KrUCvM|$lnc_sMaTVx3ZVacR~&LqEEgg5wW2n$#JTstmaS(!78VA&M(vv%&;?&LQA_bNT`?k!*ytOI}a@lI7= z#FaKV9EeHv!wAUnMx5HW_LkeaQwP^Bem+Y=coZJ1Wel`tKV&D5sG{&<2DLi-|4+2K zcVg!QgZ7iwuXl=}o-EfDZz;UU0u`>nC9LqtCPg#C_EUEG;5kCqPLU#tJ0#EGv9p(G zKsyyqVVuRVM1LB^o$I?$%>e+IIyZ~{wVNN9F8*}v*&j;2U~M%9L6%nX$T$Rl(%silNRB*{d`$(#Kb3D#4PKr&t~HR%ag}hC#uHW&{{cA z(7FfJ6bOq-eCVnR8u+OPU!jW-6J85gt-@?7#=g<-J6_96T@Z`u)|L4moY;d;T)yQ% zh`eKL31viLWd~gOpa3Y_ccWwtrJBpg*lJ$44~d`UBfOOKQpgX&ynWAJ5DcnIMyQ#i z0Ov(R7n#X4JNjsTj6s=ZLKqW8N_A7>y&VacTB>9@5h((O&9YVq((H65)LV4AG8+o^ zf*W=iu5hmby#7zw$OgG&#}FK*X!;*|C!39F@n#89l7j1qA?aBvKcif5937j(1eNA%Z zM9L~;A+^;d!)B~jM9NKWE-H94G{M5@WS}u^k|TKHUX{tqZ?QP|@mLQ|-M(NpKXo|9 z*lC+yv@fEqY&<_sz+{VPgd5t{ed}gcId#%w+ga%Ix{7)6iS!ebsXSBB;;AP+)*h$d zt%&9E?X0_H2(+>FCgkHL5q1f966}edXSC_EQ8|WGN7Fml)cW!HrO&kqCu2TUwdBh~ zw^T?Zi4;8imp^ufxLHgPSQJpvcRl%GAX+u~WNP|XHqtYmbRX4iq`iS3fXyIiiR{9Q zMvag0^*|kh>dwfNYSTr%_!{7woFW}AV-LeXncG%sWB~5B2bBXHCay%uV6dw4I11xf^8NrJbMhVgzNfJo8r$=!RJ>I_W%xMvX5YSve#sKF86_D zH$<3E$_4Fi=FF1_&9;s>SUnDKJJ|^ha4F}|haZZwTs*FARqmh#n#p>>QX?dQE}S1n zO}6nN3b6ScLVw^ zsSvwxN5HSnR9PLgcRX-T^`s;Vp5~%lTKshFUf%WZlw{XXdHd_$(+CR`chy30I&0q@egOW+6qYn z?Dy+-Viw?(w(6*L>)}*o6yW~m5GEB`S|u-EhJOFKFqmOPqjwOuj1nsg=$lGCgP*;^ z9LJCDg?O>L%amK@_lauq4g)`?&1GAswT0y=!5r=6jT>|4^a9H#w_ltGlD@{quK{zeL~>fDQTwQ@8x~7vGP}m1V$O+ zOq@$)i_&Z|lKBA?@Zg>)2h*WnqD@~Ie{0#x3TSa4fbZ78>gytZR4P(4m8D^HnE`%L zQ2o9MX6!nK&q#u<72lDJCHuV+=K;z}r+lK}5J)J)7v<|0Ie(|0FEz4*BqjE+5?;aEQ@Zi4nRel%?+fd;f7*5591g-fgFLT?Cr{K{1E?63~B4IPqWukGvO6ROEC~ z&DkwA#rT>kovKGHG0MkA5IWI?-4h`xipsThKyvqp$ypEH((4^80R2hLe{|;+{SH<)Ve-xk z^EM_}aC?q3pD3h9f4s3E#95`Ft1`1}dGXD5Ib|0DldJwbB;q?6a6-;s-+L_!6MWjp z%=UW=%)fIKx!vXZn!YB@S^H7C(Dj3k+7)o3!oWNWzh8Fm33mI=XQWVGe%67SlXg}V zbWvm5G5y_-kjW=o(~Y3s-2AKPoyM0<(y>O6-qa9V5ZNHp4fv|++r)#b5j3$`@*_H- 
zhR<%o4v{mRutH2FTBG3nI|Is-=f^!(CoN?~4X&noLVg!u*l5(H%(8Y1D0jHzgnXM? zjrrB#Tb*+FpjM3bw`fH7v#BzP`&E&X7fO#}tViok+-`duUdTUq8$j7}O=fgce=dF5 zx(B82La`E<6?HH-Qy8DUh@^oljVTIq9q%Yb*sL7u8-+&D*+qD#WpVE^9GskBCw*NK z>WWt(KtXL_rO64JbhFP`yGaxsnL*uf*kTWsfgbfazIe5iwevb%yn-<+suN8LEYQT9 zwF`!DKtuZ>Wt}2*0p?hO8L^VcW8ra4Z;*%1I(3i>QUfShlO6!=jJhnc_8y|~qHlbE z>8xfQ*34$MP6llo1nx9_h$fr}Mr-y2?cwam)8`Psi}_bjmkI}8yN^B%PH{)yM@+z% z@ldq=rYU53Z8)HL3v3SWZ^_PK?s`&Q>yWKj7kiTWz1NP4MD$h0jMwJekTQ=V*anM3 z2$iW^Wrlc*|4$s`RTmO}&k2rC;@6kvhf=F{v8x;k@TVdEs~r4Zktvt{^)7`6K3LFg zYImy!=KvalQd!&XnM|T=WR#)6Fc&o&uHQDx;20^7#A&geemWADu$6~!Jz&+|;e5xL zW-D#+8k}`)Fuh5sH#umjP3S__TnRIGU78SicW?UxBeBN%T_7{oj47+of*rUORhIEX zZtc~+K&onI!sdPaJ2#65pA2nV6{Ry|$W78`_s=i@B%v_Qshv8Z78fkpx* z(5}~QaEL<1q(5U2Rapk1R&)F)C(sJ6Z!hYWX6}8|TABqTq!MNBoo0fPt>`uExQz9r z$K0@SGi-#_s~8(QqwmSjAMncm?=-6wR4SY2QWhLu28AvQD$t&S4Oa;)%p`!_kFsTu z;I+jZ=c`#NiavF=_4?=H_Toly{?~kGycbthuX4OwJOLTw<@Z-(YqvSFL$N9118b)Y zab(Mm8Pr7#CFu_v-- zjGRfY#$iKv{n)wU1+q>IUX(K|gHK#FOym9QsIHu}7Z--v0E~`l@_KUD$hI#rV`tm5 zui2_)&d59(wg|aOB$fuPYwAPA_fPdgxE-G4hq2~WM6B#2bF_DXK68S3T`XODE5x|q z8q#)gRgVjGSnspi(uj!n{Xs@ddkwDV7tHQ}>KERTnfr0>PaFEXgQkR~@kdrik*TDD z`Ddacx78vIe>7QVmcZoTjOOjrXs)k!+8PgIH#j(F5TJPPcIZ#rm0SdmxkmmNue|<$ zT_H2YlvAl4M}qGFXxU2b{Z=?2Cn4XlB7!Qq;|kh+bDX}VB*3#d_RdevKFQ9xN7Czv z&!$r(ga-e~q@L#qF04NpFmZ*#KcW~14xN!uCIUqKSkiC}z|Vc}*^PC6O*^T@WctSS zpyhEHi44VIWmpAF>HE+;I0bn%lLSB{ewc?9l0Tmh?I-YAXRr#T_XiJcy3omTwB(aI zIv_LN@__*=Ojm*iIJo-gLDvIbNzB~`OEXblauOga#fDZ0)AvKF$cryCeZ0hF=%eEB zcfHZI&}`&E4J=Fj$KszOksb8pvV1D22^@v9hQ1v+g}*T-?n!cxjInTAkb_@3FKPnK zq&kVo;5IG(5@4%S#;9)pCyuoZ=H!72K0iPccEo%VsrJ}8w@YzJ9%zog10_^n`~1RG za-u^h#34PIAUr{y&Gh#u<(x7l5{pzNrO)&ts_?sorJk>%XEotxc~q#eGJiVQ@@_-J z!q+X^6F;*<9u?L=lw&J%3*J0~DlsU6i!rgP4Mk`BjF+lgH#*xy%hv8&TGg>P7~dmX zx%&7A8e1e#-5&fIa@bFJW3WR{boatcrzbsC%$|hTD$d<=$CNQn7qaw_clkPC;$MUe zjDxXIyMD{JBT(BSZFrnS5ME2{z-l0RUnf1XH$pNTMmju$j<^2*#@8x}H zNS_Y#LzVE}fq6w3QF#kA4>=$MRqBh}nOZ(y{e46Zoa8`4LSiavv&f>%syk`c3OewF 
zPyC7-Tp6^;LVpYMN>>*L03Q{myLODxU?`b%bnPv{jDe>!Vj7XmpOgY#K+*HTi79q* z+!mCw%)n2UKe&`M_aif)%}qo({<`kYxY>wVTXA_pFmN;M>7xEJT>gZ_#N!|_o2 z6uelu(q~CwT7mb)q7AbTmQOyZA{ahqTY=pS|C$8If#3xK)sMkXd(e+cEcO<}NoovrNzT1<>-{qZm$ zbOq&es@Gl~tX^3e;ftq&XU?c7cWsGjoiudUlZVQ?8Zkgt0xqC=I-oMF4rcT*zgWaV z6h#z1y4!u-x|4K*ZEoAh z|73HQmux0Dj%oZCZSKj4&V)#e+FpMaBwP3Hwt(CtPhie>aaZ`7er@;GX>s-MjV7OJ zOu3eQwkSDxt1w__U40}9nmbb^P`w?NhYZYD6GSxS% z&Jfgb5rZMjyXrULJSly>jKeK*s0_Z`8^PyQMonNiGKuLv*q#4s&Oygh!0j=H=#^as zL_oJjgio@`5T@YyVgn(O=kGyb`VV}{5|5&*YY)QWdlh!mT_1n^aX^gZ@Hm6F{=Put z`b%CzjxV=hE~n9TqAqsp^oiLZ08ja*YmP*YZin2>sgB=+>GsLH#ksQ!Obq7$EmmDP zUt!ewujxsmw_rw(bAYQHHUunSZ8g{z+wg8XnLT_t;#7Z(-Gn`J2f7WuIahdwfel8k zRaJ{U`A1clrpue7>m2R%HlmRY8Qt_;P8c#fEb&4B! zo#XNH>+QE>?1}!T9{7TmH#G>jcqcy}ebV`Gj6B&$b4u?r`cU`Iq&8?ahQH~${*|9x z_lHrwCb`hHf?`$R=%Q83kI7U`lx?h@M-PM<#^gnnWj6r0&pJbRn6ol|?0CsBzAlqj zpcuPEiY;6ph{8Cd0j=}bB(ZzlL#d4MJ35(BI<&x^R&XScs zijp)v5N#2NNKWPbi(P&?!Ef^v_x&}PQuu#^Z7H(n9!-}4>y(H8;gXM5C%jh4q_~rg zbRM2xa=i}tO|_s#KaqgH-A&D@a|i3fXHLXY{U5MoSk%Xy+{zV@!2;w`>j>j6H)riG zmYQ)B^2@OY0r9ruil>rSQPDwijExUG-bK*ay1#O`dhYrj;>cFhny+K18&0TWs_wmz z;)pLxnF@OIpK^|CVy#D1r=&yqp3S&9!ueviwWC?aG1tnF=82SfoS`RY{iJA)6v8E#0r6SmuGvsB# z+MqFg5EDR%l%`408x_)w`#ts^0g741_`59)eu%6etmIo$!mtY3{D=MqKKTMxBA*Nh zYzDhOvXj$sK4jSYWl7}sZgCrQ0P8vybm3Os$2;6a`ZV4+4Kpr5@mlG7YnA#lZE2?$ ztZk3@AIZZr-77sU-m+{0e+5ol_FjEI-1yV~#^HOa$@-%)g>KS`yQ4#Q2MIOZD-&+r zD5Z2J=0BO`fT_Q%0bO`1A14y2dA4b3r?zO&gb)5Fj<<)^2Gu`0-t|H;5cZylhSuX| zdOtXvh5@}a4sP zc9jH6=iNpcvSmeX8Zu~drk|{u{}LQzbL$ZaIa>{Dma51)^T3LiTwc)D8Wma#N4Ku$ zP?A~PO!a_?FSLdL;E*19d846^!yftt2@(&@lLB*WpqH#%mp3DR6{8Yf76I9UUqz~k zKD6Ln_JN>)firRyU~8j~@SYl7JcJ_rT}ytF-i+quY19=aLXABD>;1%x?g}FP z?{xRuh`VIgl%DB(<8Ml6ab!mt8i>^3fk%96tmAKdvM27+cX&{|i4Pc>Z_jOG6WiF$ zO>Swk$Y~Gou(E=3a<2_vOiG>Qxke?^G<_@;&7x=INssd*q`3^EwQ(Gia&lRqs);z&>RSc z3^<|zkF;wby$G#jslO=dcMuHb=v&oMa-v@& zcS1#2CV?}kWPcOY5ms1*oZ7`dpRF37z7f(688T7AY%3-J_(+AcQ-0qS(5ZU}P=HJl zE}_EkEvM?|rx(2*6@p4vUzA{9W0VflC;I2{vXbl&e3b2E@jc~N+OUj1#ynJ9V@mbiOX 
z)}1~R;$%{*r%{YYXfn$|huUf7TFL0Uk*XQDD?w1lp|!>Gjvb?!bYe8@b~j^IMtx3H zXBc?eC6LHB>6&yrDDFp`qMKk$k-^PXuSkrT&PfI1lN`V}Wp=`i$7H=|w2RM#aLfAW za*;lJft54vK(*$=YSpHa10i>Nfn07FnKktIk|<+{aqf+-6rCE6(0OOE0NlW=WI+jjA`>EnI?*A3 zRxY`6J~m$Z5kkFZ!HXUCj!Ua@C;!ECr8dme#NTyRWnc`g0Siu}`x zMSEUa3H{DY%U9#guzC@P<6o$!8xHU%^<>usV7Pug?%!J7+OtBhbN;i_z4RYWH^=6U z(>?f~@Q%Ma-F1wAINcau`_guBJR?&K#Yk&p*~dCpwiG>%Y9mK5LLk>N!{%358w;|)n5HZl`rf2B&FdZ~6j8qm1M_nGRVJ{@uP3H@hqS(DvCCbItV$?;6?UplTx`4S!k9HJG4ceZ<|q#9hq}S^L3nCI zH&*xQz}fc(h|mJCJEQX|a?P|c4voT>4{YKrnj%j^fF)ga0~p_&t`5GMAWALf9G9a- zjatR48l`n&=v0F}IGxO89k-lzeStiCsAR3pHM98-PWSLXKp79;!oC&|7KxK;c3z%r zZ(+8Wjitf93S^;UybS88$|u@e2tchZ$3hBO6DKzN@BX9D-B^Frn=pMNGigGLs{T=B zF0zXPwPje;Gt!;20ec4X)e`UGNY3{dI;BND;|4WHy`SHD zn%t9$_CN*J%;G^UIe`hF+@ScXu!)PFoTxIoxQod8GLFWrx6EO}4 z(LVM;p|0vmB;IrMF<|qZEGnux;Br zc7^cPe3K+qBuG=sE*^0gFRwh%QBFBw>t((3c6HZteRr?U7u1#%0gyLxk3inyq<6vE zW#>NhBRHw5HtfxpyG{4RNLA)}urhdNoV5{V^{_&-n5j)fsE^gXuiaIf*%ws4ot=_d z4~*_LJe7^C5`4n%TPh$1=a1Dl8NNxc4`SiWZ%h-kV7fbKqNHLWy-D)@bNx?^(_^=V z-}nV<*#2vLpUar;1}n_QgZNl>flO!2)N$T!PQ@E#I|0)_&}r16xYVbUot<|A?r4u7 z)#%(GqI7vwFBVg(;j9(3?(ti@`}Bz)U9N?ybowxJi&)Ti2#ju$bQ{o)J1iGRlCJK& zK+9%{h&dz`z0ybPD2PPKNw`{*8gM+2dKGSR7+|VJ*)i0HA`MscjWFH+q0MVmR%it@ zb9W_R_(?agnl3EHRB0~r`YZ@k3oNP;x*IR7*_EiSBrUv?Taw);6rT0h$u?2H>E!c1 zU`0Lo06O_|R5`2>vGY`(dd_k+_IAI0#a9!^eV17Qei{Wq=u=7ho3FFDd zZ`*cUaUK2}0Sjd`kM~#Ea87&~!~2i+HDvr>+t)D=*?p_yfm=FbD4&C!j@@EIA_N>Z z7jJOX(qUit3m;!HySIh93Eo1Am1~v}kzZ!u0_}@t-tR}LNv|b!JNAQiHgcVbGj|sF*dCdoZz_jSV_bu1M<)vr^q!GKh+b$U^OXPE(f35ij+sL@ z=9~`o@iMULhK$i~maJSaEef)NV)$V$X5HY*V30M+oR#DLkv6QJ@AdJ$Sv2L`F+!8k|A$hhH6Zb4WK%^@OXA z0GNqcBc{A=%=Gt8wV)try=sNd*c4V`c1_nB?aSz|v5>kAK6NZWK67OD&iJkBb`w5W zAu7-v3BM@mK8IEbB1U!eF&94!H0U_JGX zc)GKKjdA$2Y$Po)`fMNdJj0ea4cWHnY|sL>UBI_Ab4E;hkm?b(QU_`RE$W{SoS4^i zPM9w)9u^$>H@1XvSChd=y^4pnIH1|R!;D#$bUUb`pG$wuM@*_R9cDCAd4nIb+OZTi zPTZK4DCRPSa;jpdF(YZ4+dy~tm+_lT$P|5!3J2Yp4+E%{2eIWroBZRjCNuTS!c6fx z}nHcTbMat>VLGV5d1oV~ik;&he3<0+eoIA+yR9<1F5i zJ_(GozlEM-`wm9>@Ni7>6_o4?*9qU&gNG`i9N#Im0b 
zDVV9{K*4F%rvnW*y4qkd=daCzZ%t|)XMOui$(tC!Md$Z==YGnFs02vANxZ5~4%kN! zJ&}l34Z`c>n(7t2OtS>ZYds(yB_IbSBDrHPPbItO1OwE=ZLp_TK*(A`*qp36x}r~J z;LL&>dUX~E`sZu$4q9M}F+$B$LWs&am%in(dCQ3j{HGm)1xDnVJv*$V#YJLGT87mZ=4kYy8VF*fSr1td@~ml zbct1+y9Oko`9@4WfAlbRkK;}&h`1TG@E05#(P>=sQ5;Ob!8kJJSB3MllGbSgb9M*D zf868ncS}5cZ@%G+x?^{>rV4EOi-dv0xZfwCaJEYV%{;TZYtKg6K>gkNE-%3kqQ#PW ztrHs4*3mQ~3m+TEGgKFV38qKkLo_2nUmt3nqOwkn$Q6z!6HN`LlspYFb0ts$+$}2{&3E0;YHw&rb04SC=dX6Z; zJykX2*K$@7a*P6<1yd^$7>Up-i~36IRi01V|E%XRao~@hM^P;3*%g9!Z4C~(*hE%K z`ykQcIW4<^CJpz^T<7xjw;I4KkzUjaYmn_FV|h_J*fW(%;w>J0yrzs(0a4MHjK~S{ zG015tyA+!Vw>|{+z%+T4BkK4Uw>NUdyBQ8IF7LftdIqgzW>#dO;KO@7_ymF?8av6+ z?KUwu)E4PS2K~1M!1xDkaJJunuK`yQ(|@~? zcp>5bK~dT@MkQ`|`mr_f^2T2(i7TZ8|Ja&^0vnfw+iGCg#n-CYgVJ~Zwhz;H9?V$H zm)yXGM>~i#dY+5fJ#@*1{8Gwmc{85AU$p+=f-n6BS29%;c(HuV^V`+Nym7UcXG-cO z^&HGK&L?kkIv#;z$N%bT$BteRl%0b)E1^B>pFz|=Lj5aU!(-2zDZ6SOTk5^R)nL6b z?Qik*ROXaK<%av#UENi6i#Ot*Kg-yc|L9rJOz=eO76F?cr?2qGa=rwWNhhN z4!5#a9xP}Ve^)rBxw2WT??|X_$F9(ltEvo8G_a_Fy_udU6rXR_(d48PXTfHu5!2W> zu>N2>US1{n{v0&7KWQ%gX>Ql~2PLWH6UjfnK$ZGU-CQLptq{R~zK=$>H0MZXeNg;~ zEc{_9KJOW}dC?dTHfhqmJhHd#;!Qmu1eqJR|2xc0B|0~2ZP>34_mY3W!AC!5V=i9g zEH`{h-cx42)9S1E7aIJ=^B(&OE6hXzs*k&JaAH9026q<3z+D3&P1^8OdkdOH#NvUSobua`I1j@bnOIT>V^%PPOlMQCw7+dK(TwlT z%YAajef8HmVqmZ+(|n#CXa)4c!#m^T#vb{?5a@kW!;2x>MpC(m%2syxXw&EU1|OhL z!7d3RJK*e2Y`{D*F+MB%)6G!~B2k!rW%X`BC{t;XvXtgaAGBa=D-K@M&jKCUieq)eYPC zR;c=m9HdZ96kxeS{05!BinTJz3*#jn!hnMlzpPOglSbzG0OP0+%xk;kLhAlth42;i z{7p`e_p#gW+wml@+dWQh@5dd$xGDP4O=X+Dkbm|+s12|NbT3WKaJxd<{)uvsTeUOM@jnOYpxBuw{AZ9j>SWez$%%T;^_J_b_;R&bEyj$D0O` z4GL@r{sFM%)Raqk8rl#k&J$eEDtxdL=5=2l z{--Y51;0HF;fj(7SPDp)97Xt_o-%i)Bj+STc8fZ@w1N9_i`_ag%0&vLZ6k zEBf_a--=C27f{QKE!dH`BMQlBK}B!^_v4JnxT=X|)YPS)S?;s3I~(%#^X?dk{y zI0QzX%&|BLg-|;d6WC{7Lr^Nf(J1@Nphpivm;Wg#PC}^V-18@JQrLDcxncc@U+R^w zr|TKPxA-cY3)D|Wwuv@wubxF&r~Kj-$AwW5KVc|-YW;w&KSX}uqsqV|jtrIHu6JDv ztL!iP<8Dc`-QH-TP~rvIr}Ly46Ss2f#qPfW)3lC1y-~;)&s&z}nfh3BPAtxG9es zR>^NiT8z)XLjWhbw-mSX31>~~!E+w-?)94f@6}3T?2yAr?8+ch4{!9A_59|$vTitI 
zG_%k9e&Uidb4!IH$2$#j-E{J9-%*o0C$Y^`C3#bxkHMaeqM`1h1wl zJ10dbEDoJ5#0!kJe_Z1rLz_R?F?>l>rsShLyvlb{nz64|h|h>e%}g}oeAtNQ7yS!8(t<%~utLfZ-LGATO5U%+Y>kTZ_%IrrBXO#* zvFIMn>VUhucKlWb;@6&xDQS99=x1{M;i<&;>k83SZhC*BfZ){Y1Q!P`bNjb4^JTBL zTHEpWiYtFOdi3ws8Bo#9nd!VU>}3f>^P>)ulpqWlEY_OPBnoIREzBxoANtw1qWY1{ z6THEIVZ{4mp4&}_-l9Ko<)c;QvQPys*s|Q(<^{7-Ozwqw`jgsnd@H%7ic~}&EEjiI zre4|X{p3F8BRg0Y4B$>D;`u#x_KmsyS}69Z0~^_Kf?7_Yg3`tdMCNVpspL{YE=N^m zW}f{Fr1*KS7=pz52xwN`Cb6cg7QiSjK-D&6P?JPo2~<`lmGUiVGpSU?BmxY1C*0ah z@?ChT>^P}kkS!RG=@Hm+zAw@B>0XTI7WwXwRh7ctJn_`s|9S4WdA*X=JArZ%y z?{x;s>tC>I8xR0K55lqEIbx;oef@^Y3MrMy+wE0Oo!Y3uo)m!mdZ$V*sK}*dM-K7MF^KhwYgf*xvf^f;8aO*!f^&Vr=q4o2bP9c%!sg*eAaj zsfs(gsBT9Y8|-da*sj|NUlH-``-0~5K;nt%r-*>rGr|`V&oPXZn+W+Go4iG$e&N)* z_=UU;Aco^?&EZ_517Cx>O-b+fbA%vuo&$i_!OD3m3l}8Nzw%)aa^*3E90nnmSpAx& ziDxz+;UB+f7umwGxYx%EVWmFqUOW8JePvv8Wog~WYTT4UA=}l>jZU)a<>NwLXVURQ zVeQhQNUpAzPoXoZvOXueoQVtOMtW9!DDZLf0`VE`Ved+g@LT1JPvS4?b#*153@Irp zd({o?!!v(Q+Q#mKS(9!y#I~Gq3jTrOToCrm#KBOs$F>T%;Rm#j0b{O2k?~je`R^o- zUvy;c)=;VRTIwCHuUxUEj)zf{E$`I4@sE$Aq{8$S7915MZ|6y;Z}@#SKX)0kX6%|$ zO4ON}W)6%kh;29@{=}vm3{E444-U{?qWXyhy<`X82a<9HwLg6ySjaXsQh)2@Xt1e^ zJwcK%u$>pB_?j-^%N4!EcL0nT22JJ%c>tnO)VD4bFn*uel*c_QhvM>@9b5E$FzRlg zn-_$_z5dAK0t*PxDg2Udi1PsM^V&P~dEQ8pCpz#_zPfV!`RN0IZatY=!qDrzeXfRt zI0xj%X|TA*SiD1hi*Z~WDfo)_5_{Ok#k1`c-{LS@>`09)5oFwZO9)pzLHdTgr&N#_ z5Rp|fd>L~WetBjwyOhLTwx%%AzCzT&)&zEk2A-Cfvt8{Q+e{T6xi;5k{a8&Nto|;m3CH=nqdo%2g{J_(E6!Rd-ZTTI(qgf<&Go0!dJFRc2 z4$ZUQ&&$2QM~=U`X9;D`NU!<2*`wamO0zZM(uVRtyf?5Z@!_3ONWM&X!~r(M}7OvPY&N-KiPq~St@}FKCfw13g)Lh9;1*id_&$WV8(Iw$3hIhPgH|% z^us^!05Rileu59iwF-?F00>AzPTS^RhQ9;s>=m!Sr76#IZnZK2SI!#8drw2exrf2` zX52HvvA*th0ngxW<+v$_qEm1DLOEQMw74@;Q3tS-l?ZBrQk{y%wZE)hu;T&dg4@e| zE-QBSd?jLTlPr3Ue+F}#?8-LTwgQv-e%j#x#<8Zif{?zi)y&>^u~@f^mO)NhFB>E8 zqkc`>(Y?-yg8&>M(IuA)c2yVCk@}&WkS`x|ZRZ{G`8z9ZKC!&_HPFzzs;$`-+QR0E z=f#pDwKM$mmjFFglQg8y_17ZYL4#d=v_%jr0&@Fy1$)esNSe2zmQ1Rcc5g-fE{qh>q}( zp(2D3p0SVLu`#I)yu*wi&u&^Au_ofenazJKD(u_Fz7wq1SoIXSFb@xQ>J^-rczEI!y zAD=)6G%*6iXTMxYr=@N 
zYr@I)3$;yp>BDBDZ*q&%1`nfj0lC)R0)e}JU8L$!W z@>SI54|yn)rf6BFVFOo%L&Ovcc`6|M%%63HX7q7jz2c#z&Ale4C+`>fc=RedHt)xi zY<*z@cX6;^%Tnt#!xAo8X_U(Hn!d~Tejeb1t5XYzPI!Nx$A3POGsf>Ytv2R2INwO$ zbtOH|r9-yCChv2PLbMn7{ot-)d-IJyW2|quKN|<5WzdU>l<@YTz~a zY9GjDhSEX-+;TDD+Xy9*!ol-1Mu?}`uRA*ZYVWr+w4B*_8)-KHiFV%7cLZ=Z$vgu> zA@ELS8BOZs@SvE;tqs4-;z6a40xuUy^rJR*DE2g8Fu;jvrY;=D#L06Ppffk^-k^s3 zmcDpb>+S#_@tkY*6z>G-uuVm2VgE}vxNDF)@WRaKzF)^=Qz!EnR(!Cmkk(OWmdq?&f?4Hne^0!OnS7txNwzVbJRPQWIV z`VenSSmwJOO)2AfvfK%d0@d)xZV=Ph@4qmlb&MN zQuK9c|3+CHuqlkeg3tfu%5AxayX?6;x&7)O_>Lb^kuX8pl)57Yo9eohLWvx|c6UEM zvmWE4Eqii$(UBvOi5h@=>b6`H8T3wH&RxuOe^KAnj%ADnKl^4rwVByBRL6AF?Sptn zI+48O2}OdtB5octiEnI?M+O_N5AJjSDK7z03A@CFN2aT{RGbGQ$K}?XbqTJ}7mTZec;gOm^&i@D6h3mW54aO<@R*}` zq-j5c=`4Ze{9y1W$5~WBoMh(i3g8P}#aX)qrELng?tY=n?XaYrFTxN~xVeW=CU5bI z!>$WJq`CUZnGtwyIQ*y%O(b)rd}N1#s(#10)T{8Atl%4EbW6Jr)pNE)fG* zbPY&?@`M?snQkwBq+_2+&zC1dtBFX$0g%|%XP{azFT#4#O@xm3@Olu{dnAe2kb=|a z>h(m&kECyRsXlb2f5D_yZm*l*>0OgV?zjZT`Nf)HdkB>aLVyq!LOdNlr8+|kJp&4= zwz8PcMnGV`ZseHZxkwj&I99_!yu+#iUVOA**X58&Xa3hv)w3-rl2my{kImcQ$+*eh zpP77cF>;I&D9)k*5=-|sck7~lLgD*S9{!A?wbAqFY8Olt+*78ULIP38PaJx{IjHch z*e9%AcYywehbY8q$L2bS5yyu#D?jZ^_?)jKHGG$yv(kMQxMk1VrE)^;xXhLJ{Gt)KO$&yl`x`V$db1nrnQpl68kwW$X zk2_Z=8>Y|1JsN)YRqGlhK5Jp2FgC01Fy)MEBteHKaFtdeo(3Z4Jbyjwu^M=5(Qhl{ zSK=u|0xCOi%-FvvcUvDgd@;Z+cH!N_TTLY=BS$ci7mIL>W*&*0?;nc^ycYM@N zE;X|(#=2uumvTfy|DW=%G@Pw%4JYNssa8~#Dk-fFh@m>ChE{88qcxMBgpv-1gh;fa zXob^~R%x|0RneGAL=sBGlyZ;Kp0t|qRAiDoYi0n@~ zsGHZ3Zx8UQ2bGBZ|HH0F9!r!9(x2_L($P^-YBjCAZ1hy_-PQ7bdtZ&Dmg;H*>X(E^ zkFM?69*5IF8$As)FzD#$IJIhGW@dKafbo~Bljf$0cE*a_xIM<(cRjG#`y&2uNBMJp z+zo&KqQ|rT8()lIt1gsx9KJi2+o3TY8f<97VltV=N(VR_Xl@pInT+jq`Q8zk~; z${{^ld-QK3afkV%a28?Rx_IT2&RTmk^Xq8BHSO(jJvG?M))N~0gh3r=`N~Ol>5Qs1 z<-M%8Y2#Fzz2wl0H;pF{)lb`6J!lYA22plM*qlOb zLG&K#jb7HMUfQydtbc#l^oX8Oa2dqO{FUl1eWRG)puKvGlqhb}F0s#7G(Aa0+u! 
zF&6;~++1~JQam(?`@N5HV{Ht(y~{_=gqye(M$M)~m=~wHrmPs(L){~}*;Z~&({N`r zfRWOW+p=CSxv7C@_qE@L_qt5H!(N>E(|4Lb1wD=2?6jUSHP z?L65@rx%6wEex>IiYUz-(A?U0vq!tAHo2TXI7WkExDbOL57*3IuRcb`!(N^Zkv729G!zpuN?z@)?&Nco@pMMY$RP?baa-2 zoNJr?0|9v=@}^6c%uo8=kzsP8xkZU)`*;36w|9vBirEM+t^Lq_DJxTDOY_C9FQtVS zWKrU@iWB1Lhw2VX+H*Z0EN|*kjzfDPkDH#+G-K^(&7G|25_-cGR|WU96af9W$%%kK zIBC<#$f}w|SBWkD@aukk0|SFV9BVEhFpNzZk3?NUiPD~J07=U&0KloK;WreW?oS)@ z3+^y)l^&P2Iw^=SEr1?3V_icBk8knF^=}R0=&M-?9*U6k`jf#RUrgo1bB7_n!aP<$ zwyyYt8&b07;_xm*brMOx`|0Y+# zT$z&clZ8vDdeKuf1PA#FFR>9pbMjfymLZ%#rutpP-A{Rtmc1R{w5XjanUcUP+2FnX zgWoT&eo4SWV@wM@cj6>TpcvkYK(rI=dw)P6Ba#A?UPFp0wL4f^=!#nZ3M8Xj@ZG^l zX4hty?o!fF3%T_xGkEs2h*FMbGHqx5lDctBdCk(ECCA2^X70E-f1v4j4c7>#_@*uq z(Ca5nhDhE_#e^8Sy87i$|0f30Ty-i=tasGU2vhJyS?;zqz3BAIKbvudM?jxZQ~xF> zzvm}4>0S(c;|ccmWB>vbocr$Z#*W0m8B$LbBIoU_AEq5K1JNuEYfO1uJAJ&_s-ioR zp`~BKuA7t}MoSSv#+$N2Fn1%-c#!*XpE>=Wc;S~{Zi!3lBGe|LGfC$ct6PR1A$h|U z0aFaN4kY;*s|80w;FLiYMP1Q@^{y^Hn;LP2{6Pm7rvu>2Ik{u6Ej}WGAcE2!Gg=3Z zgJz;Q700G88NWiu43in%wT0&AVPr7J6Mqwf;dz*=l$N7k#`*}Do(w=`^P91F73aI} zr-Ys8g4OqeAo-Hg9zR;eA2?1Gc~clLOz)3F)AdtzuD_%sjjS@FPI%{bsrgb}%|W2K z?2I){69cIxaI1}Ll=eaB?z3KU1T$EVE=@E<3_6^*5KL_vA|hhp)O<*BI06Q>my6M| z#CSslLZKGA1nf2atCxa%RB)G|gCrU%mPaSV;63NCoSc&}48bsx_Hk$e1k?wm=4($V z);Ih;XB2qYACy4HF+63(zSi|HKzj8+prTa4FGeo7vrY}F$X169Ufo~ZRc)_9#Uu>X zjuH{L7)|5CENJ*1k5e#5+CPCt!FsNW5x4Nd;f-WGE3*VIvf}jGLBqd*iM5^Wb^b@4 znZ~?HZ5Fd+16swv7NRAKblaIuSuuIJ92grNYXfW_f&BSmC-K;ZmR?;43c1}_i zK~x*YP^XH*D?Qmc-Z?|8I_d&q+imKGrP853M6OO$1k}2g3^ITG3I^EG$Au{~44s&y zNcpDHf_;hNPGh)LAdko(um^^F`kP$U%F|>Wb zY_Q8LtcE&G2-<+nHMMWa6Qrc3taF?{7TPX&-(LG`;d~~8+~5Z;GjfLII8S-C^?XXH z>2(^iAi@N4xXew*sy5sqxZ01)l~brZzS}BQbVbRFE0mCByLS>9{&k=9rO_`>&{pC_ zgb+}PWC?Fo(fbkE7@=y#qqJe%+9P@iKyJrnqSDga!_bH4&Opm+#xIO!Y?WTxPS%&| zf_zalM14V*4XANQa>%g>2op?F5i_6?sjqy(+}Ds0YtnNOfh(Scl=SBYaD|w5d)NRO zd@^!?wFPLEP_i`b=wBhavbj>oh^d~Awv?d2ML>aGx#Bm%=~$@oT?}T<6#7tGi0I#k z3~YO8W2Y}g`R2Hqg9ne_q#-Pd+^?Jf-Z}SwxTccxl^p%9e~!vP7%5M{+e;fPK$R)! 
zIv~xE^z?KTxEOw@0ZJ>ZudnB*H!^c1Ciz{{-Y`z1bbfw59ylbiz8>7z3ZyZ%8xJIn z_miyUEEo}1{5%-blBqeqqm4*v9UUFzGGZS$Mtw%?kK77G!QiRixlf?uj$2Xiy=v!= zFz#x(oymN)t&IGD0+)9C*XR|&r8{E&vmS4qr#@_oiaiD@963I~L(*0UcoCabn9%rDllWu~m@926A`ysjAc8VpkLSB0)pC?HsJG9}!BUkQ ztI;6`+%3AU*BB+kX&+ZtJYj%$GOVMAbTXN>3`^_8VckK`w)k|HsnyeA&1g0-I;Ewb zXS>7&RW-n6_uB&w85-h=;NCk?X?^B6a5ZkKa)`hMn{LVotuQkE+hGNTwx_ZKlnsh> zkY5A~`6;eL_UncFx0e}6fnTIB^m{QfJ=j55XY~9*3H4TC%3rOipqO(FdTafp>Lu`S znM(1(u`H{bz+EFKl^DSanf4kGTuP5rjHz0}RnIO3T~1)|28wxyp${0iC)PKBjD6Xe zXb^|*X?(M&GA%XmtZ-0RQ99!&lP;Ox=#~1u#dvIOoG3NC2ABv4DMR>vYS(&1pnVBS z_A~D7THMNqr?Ltj`sUJj%TK|Nv(a;93#&3Rn@*q1LCGBPo!LI(DYDyiF#9!+PN&cG zMv=pA9pkO9$>th-QL@!Q*~ zK{i%3&Jkv(%};>U4;jB#R#+*@kDjbweV<#r8-%OFgAsFhxv23&8)o!j(O7&J+?prlm z&%OavVeOD$jQ<|r_NMUi&u^xD56pt>{kcm_!^foaJKzx-&r@C=y?oCeIBrk4*Y}p1 z4^Jk35?hCgI1s0wGHg12XWmyS=gw(8B8*_jjDQW7#oxS<=X~{o!^hnPlcAmj*USU$ z2?!L)@+Bm@Nh?lUPajn6p#U7=vSRYQJLn za|9Iiy%tHSk69BVdNoI@`&t_+H08(Lba=+q*r%w+Xbet+LFUQNg0lVyY3wd#b zeHtzat??38sZnXc0QpFNL@9Z2j@)8Fn`Joe8dv-^OHWnhzd^^gMarHC<3IfbwRs=_ N7i=AESm%6y`xmc$!|MP5 diff --git a/docs/source/backend-delegates-xnnpack-reference.md b/docs/source/backend-delegates-xnnpack-reference.md index 8fe346680d4..d38c5af60fa 100644 --- a/docs/source/backend-delegates-xnnpack-reference.md +++ b/docs/source/backend-delegates-xnnpack-reference.md @@ -70,7 +70,7 @@ Since weight packing creates an extra copy of the weights inside XNNPACK, We fre When executing the XNNPACK subgraphs, we prepare the tensor inputs and outputs and feed them to the XNNPACK runtime graph. After executing the runtime graph, the output pointers are filled with the computed tensors. #### **Profiling** -We have enabled basic profiling for the XNNPACK delegate that can be enabled with the compiler flag `-DEXECUTORCH_ENABLE_EVENT_TRACER` (add `-DENABLE_XNNPACK_PROFILING` for additional details). With ExecuTorch's Developer Tools integration, you can also now use the Developer Tools to profile the model. 
You can follow the steps in [Using the ExecuTorch Developer Tools to Profile a Model](https://pytorch.org/executorch/main/tutorials/devtools-integration-tutorial) on how to profile ExecuTorch models and use Developer Tools' Inspector API to view XNNPACK's internal profiling information. An example implementation is available in the `xnn_executor_runner` (see [tutorial here](tutorial-xnnpack-delegate-lowering.md#profiling)). +We have enabled basic profiling for the XNNPACK delegate that can be enabled with the compiler flag `-DEXECUTORCH_ENABLE_EVENT_TRACER` (add `-DENABLE_XNNPACK_PROFILING` for additional details). With ExecuTorch's Developer Tools integration, you can also now use the Developer Tools to profile the model. You can follow the steps in [Using the ExecuTorch Developer Tools to Profile a Model](https://pytorch.org/executorch/main/tutorials/devtools-integration-tutorial) on how to profile ExecuTorch models and use Developer Tools' Inspector API to view XNNPACK's internal profiling information. An example implementation is available in the `executor_runner` (see [tutorial here](tutorial-xnnpack-delegate-lowering.md#profiling)). [comment]: <> (TODO: Refactor quantizer to a more official quantization doc) diff --git a/docs/source/backends-mps.md b/docs/source/backends-mps.md index 5be3dc72403..0d86c8e5c64 100644 --- a/docs/source/backends-mps.md +++ b/docs/source/backends-mps.md @@ -42,12 +42,6 @@ In order to be able to successfully build and run a model using the MPS backend ***Step 1.*** Please finish tutorial [Getting Started](getting-started.md). -***Step 2.*** Install dependencies needed to lower MPS delegate: - - ```bash - ./backends/apple/mps/install_requirements.sh - ``` - ## Build ### AOT (Ahead-of-time) Components @@ -97,7 +91,7 @@ I 00:00:00.122615 executorch:mps_executor_runner.mm:501] Model verified successf ### [Optional] Run the generated model directly using pybind 1. 
Make sure `pybind` MPS support was installed: ```bash -./install_executorch.sh --pybind mps +CMAKE_ARGS="-DEXECUTORCH_BUILD_MPS=ON" ./install_executorch.sh ``` 2. Run the `mps_example` script to trace the model and run it directly from python: ```bash diff --git a/docs/source/tutorial-xnnpack-delegate-lowering.md b/docs/source/tutorial-xnnpack-delegate-lowering.md index 12793533766..bbda61aadd8 100644 --- a/docs/source/tutorial-xnnpack-delegate-lowering.md +++ b/docs/source/tutorial-xnnpack-delegate-lowering.md @@ -141,7 +141,7 @@ Note in the example above, The generated model file will be named `[model_name]_xnnpack_[qs8/fp32].pte` depending on the arguments supplied. ## Running the XNNPACK Model with CMake -After exporting the XNNPACK Delegated model, we can now try running it with example inputs using CMake. We can build and use the xnn_executor_runner, which is a sample wrapper for the ExecuTorch Runtime and XNNPACK Backend. We first begin by configuring the CMake build like such: +After exporting the XNNPACK Delegated model, we can now try running it with example inputs using CMake. We can build and use the executor_runner, which is a sample wrapper for the ExecuTorch Runtime. The XNNPACK Backend is enabled via the compilation flag `-DEXECUTORCH_BUILD_XNNPACK=ON`. 
We first begin by configuring the CMake build like such: ```bash # cd to the root of executorch repo cd executorch @@ -168,15 +168,15 @@ Then you can build the runtime componenets with cmake --build cmake-out -j9 --target install --config Release ``` -Now you should be able to find the executable built at `./cmake-out/backends/xnnpack/xnn_executor_runner` you can run the executable with the model you generated as such +Now you should be able to find the executable built at `./cmake-out/executor_runner` you can run the executable with the model you generated as such ```bash -./cmake-out/backends/xnnpack/xnn_executor_runner --model_path=./mv2_xnnpack_fp32.pte +./cmake-out/executor_runner --model_path=./mv2_xnnpack_fp32.pte # or to run the quantized variant -./cmake-out/backends/xnnpack/xnn_executor_runner --model_path=./mv2_xnnpack_q8.pte +./cmake-out/executor_runner --model_path=./mv2_xnnpack_q8.pte ``` ## Building and Linking with the XNNPACK Backend You can build the XNNPACK backend [CMake target](https://github.com/pytorch/executorch/blob/main/backends/xnnpack/CMakeLists.txt#L83), and link it with your application binary such as an Android or iOS application. For more information on this you may take a look at this [resource](using-executorch-android.md) next. ## Profiling -To enable profiling in the `xnn_executor_runner` pass the flags `-DEXECUTORCH_ENABLE_EVENT_TRACER=ON` and `-DEXECUTORCH_BUILD_DEVTOOLS=ON` to the build command (add `-DENABLE_XNNPACK_PROFILING=ON` for additional details). This will enable ETDump generation when running the inference and enables command line flags for profiling (see `xnn_executor_runner --help` for details). +To enable profiling in the `executor_runner` pass the flags `-DEXECUTORCH_ENABLE_EVENT_TRACER=ON` and `-DEXECUTORCH_BUILD_DEVTOOLS=ON` to the build command (add `-DENABLE_XNNPACK_PROFILING=ON` for additional details). 
This will enable ETDump generation when running the inference and enables command line flags for profiling (see `executor_runner --help` for details). diff --git a/docs/source/using-executorch-building-from-source.md b/docs/source/using-executorch-building-from-source.md index af8ebfe6387..a9777425bc7 100644 --- a/docs/source/using-executorch-building-from-source.md +++ b/docs/source/using-executorch-building-from-source.md @@ -64,25 +64,15 @@ Or alternatively, [install conda on your machine](https://conda.io/projects/cond ./install_executorch.sh ``` - Use the [`--pybind` flag](https://github.com/pytorch/executorch/blob/main/install_executorch.sh#L26-L29) to install with pybindings and dependencies for other backends. - ```bash - ./install_executorch.sh --pybind - - # Example: pybindings with CoreML *only* - ./install_executorch.sh --pybind coreml - - # Example: pybinds with CoreML *and* XNNPACK - ./install_executorch.sh --pybind coreml xnnpack - ``` + Not all backends are built into the pip wheel by default. You can link these missing/experimental backends by turning on the corresponding cmake flag. For example, to include the MPS backend: - By default, `./install_executorch.sh` command installs pybindings for XNNPACK. To disable any pybindings altogether: - ```bash - ./install_executorch.sh --pybind off - ``` + ```bash + CMAKE_ARGS="-DEXECUTORCH_BUILD_MPS=ON" ./install_executorch.sh + ``` For development mode, run the command with `--editable`, which allows us to modify Python source code and see changes reflected immediately. ```bash - ./install_executorch.sh --editable [--pybind xnnpack] + ./install_executorch.sh --editable # Or you can directly do the following if dependencies are already installed # either via a previous invocation of `./install_executorch.sh` or by explicitly installing requirements via `./install_requirements.sh` first. 
diff --git a/docs/source/using-executorch-ios.md b/docs/source/using-executorch-ios.md index 7238a62af79..508669112f1 100644 --- a/docs/source/using-executorch-ios.md +++ b/docs/source/using-executorch-ios.md @@ -11,8 +11,7 @@ The ExecuTorch Runtime for iOS and macOS is distributed as a collection of prebu * `backend_mps` - MPS backend * `backend_xnnpack` - XNNPACK backend * `kernels_custom` - Custom kernels for LLMs -* `kernels_optimized` - Optimized kernels -* `kernels_portable` - Portable kernels (naive implementation used as a reference) +* `kernels_optimized` - Accelerated generic CPU kernels * `kernels_quantized` - Quantized kernels Link your binary with the ExecuTorch runtime and any backends or kernels used by the exported ML model. It is recommended to link the core runtime to the components that use ExecuTorch directly, and link kernels and backends against the main app target. @@ -51,7 +50,7 @@ let package = Package( name: "YourPackageName", platforms: [ .iOS(.v17), - .macOS(.v10_15), + .macOS(.v12), ], products: [ .library(name: "YourPackageName", targets: ["YourTargetName"]), @@ -66,7 +65,7 @@ let package = Package( dependencies: [ .product(name: "executorch", package: "executorch"), .product(name: "backend_xnnpack", package: "executorch"), - .product(name: "kernels_portable", package: "executorch"), + .product(name: "kernels_optimized", package: "executorch"), // Add other backends and kernels as needed. ]), ] @@ -113,9 +112,6 @@ python3 -m venv .venv && source .venv/bin/activate && pip install --upgrade pip # CoreML-only requirements: ./backends/apple/coreml/scripts/install_requirements.sh - -# MPS-only requirements: -./backends/apple/mps/install_requirements.sh ``` 5. 
Install [CMake](https://cmake.org): diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index c663b150fd8..5449ced09b9 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -668,12 +668,12 @@ def save_bpte_program(exec_prog, original_model: torch.nn.Module, output_name: s ) # Generate BundledProgram + output_dir = os.path.dirname(output_name) + os.makedirs(output_dir, exist_ok=True) save_bundled_program(exec_prog, method_test_suites, output_name) -def quantize_model( - exported_program, args, model: torch.nn.Module, example_inputs, compile_spec -): +def quantize_model(args, model: torch.nn.Module, example_inputs, compile_spec): model_int8 = quantize( model, args.model_name, @@ -705,7 +705,7 @@ def to_edge_TOSA_delegate( model_int8 = None if args.quantize: model_int8, exported_program = quantize_model( - exported_program, args, model, example_inputs, compile_spec + args, model, example_inputs, compile_spec ) model = model_int8 @@ -741,7 +741,7 @@ def to_edge_no_delegate(exported_program, args, model: torch.nn.Module, example_ args.memory_mode, ) model, exported_program = quantize_model( - exported_program, args, model, example_inputs, compile_spec + args, model, example_inputs, compile_spec ) model_int8 = model diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj index dcd8d5c6dff..ddf7f32f043 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj @@ -547,7 +547,7 @@ ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "set -e\n\nif ! command -v cmake &> /dev/null\nthen\n echo \"Cmake not found, please install Cmake. \\n1. Download Cmake.app from https://cmake.org/download with version > 3.19. \\n2. 
Install it to Applications/ folder and run `sudo /Applications/CMake.app/Contents/bin/cmake-gui --install` to install CMake commandline tools.\"\n exit 1\nfi\n\nCMAKE_DIR=\"$TEMP_DIR/cmake\"\nrm -rf \"$CMAKE_DIR\"\n\nPLATFORM=\"SIMULATORARM64\"\nDEPLOYMENT_TARGET=\"17.0\"\n\nif [[ \"$PLATFORM_NAME\" == *\"iphoneos\"* ]]; then\n PLATFORM=\"OS64\"\nelif [[ \"$PLATFORM_NAME\" == *\"macos\"* ]]; then\n PLATFORM=\"MAC_ARM64\"\n DEPLOYMENT_TARGET=\"10.15\"\nfi\n\ncmake_build() {\n local src_dir=$1\n local target=$2\n shift 2\n local extra_args=(\"$@\")\n local build_dir=\"$CMAKE_DIR/build/$(basename \"$src_dir\")\"\n\n mkdir -p \"$build_dir\" && cd \"$build_dir\"\n\n if [[ \"$PLATFORM\" == \"MAC_ARM64\" ]]; then\n extra_args+=(-DCMAKE_INSTALL_BUNDLEDIR=\"${CMAKE_DIR}/bin\")\n extra_args+=(-DCMAKE_MACOSX_BUNDLE=OFF)\n fi\n cmake -G Xcode \\\n -DCMAKE_BUILD_TYPE=\"Release\" \\\n -DCMAKE_CXX_STANDARD=17 \\\n -DCMAKE_TOOLCHAIN_FILE=\"$SRCROOT/../../../../third-party/ios-cmake/ios.toolchain.cmake\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD=\"c++17\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY=\"libc++\" \\\n -DPLATFORM=\"$PLATFORM\" \\\n -DDEPLOYMENT_TARGET=\"$DEPLOYMENT_TARGET\" \\\n -DCMAKE_INSTALL_PREFIX=\"$CMAKE_DIR\" \\\n \"${extra_args[@]}\" \\\n \"$src_dir\"\n cmake --build . --config \"Release\" --target \"$target\"\n if [[ \"$target\" == \"install\" ]]; then\n cmake --install . 
--prefix \"$CMAKE_DIR\"\n fi\n}\n\ncmake_build \"$SRCROOT/../../../../extension/llm/tokenizers/third-party/abseil-cpp\" \"install\" \\\n -DABSL_PROPAGATE_CXX_STD=ON\n\ncmake_build \"$SRCROOT/../../../../extension/llm/tokenizers/third-party/re2\" \"install\"\n\ncmake_build \"$SRCROOT/../../../../extension/llm/tokenizers/third-party/pcre2\" \"install\" \\\n -DPCRE2_BUILD_PCRE2_8=ON \\\n -DPCRE2_BUILD_PCRE2_16=OFF \\\n -DPCRE2_BUILD_PCRE2_32=OFF \\\n -DPCRE2_BUILD_TESTS=OFF \\\n -DPCRE2_BUILD_PCRE2GREP=OFF \\\n -DPCRE2_BUILD_PCRE2TEST=OFF \\\n -DPCRE2_BUILD_PCRE2GPERF=OFF \\\n -DPCRE2_BUILD_DOCS=OFF \\\n -DPCRE2_BUILD_LIBPCRE2_PDB=OFF\n\ncmake_build \"$SRCROOT/../../../../extension/llm/tokenizers/third-party/sentencepiece\" \"sentencepiece-static\" \\\n -DSPM_ENABLE_SHARED=OFF\n \ncmake_build \"$SRCROOT/../../../../extension/llm/tokenizers/third-party/llama.cpp-unicode\" \"install\"\n \n# Include the single header for json.\nmkdir -p \"$CMAKE_DIR/include/nlohmann\"\ncp \"$SRCROOT/../../../../extension/llm/tokenizers/third-party/json/single_include/nlohmann/json.hpp\" \"$CMAKE_DIR/include/nlohmann/json.hpp\"\n\necho \"$(find $CMAKE_DIR/lib -name \"*.a\" | sed -E 's|^.*/lib([^/]+)\\.a|-l\\1|g' | tr '\\n' ' ')\" > \"$CMAKE_DIR/linker_flags\"\n"; + shellScript = "set -e\n\nif ! command -v cmake &> /dev/null\nthen\n echo \"Cmake not found, please install Cmake. \\n1. Download Cmake.app from https://cmake.org/download with version > 3.19. \\n2. 
Install it to Applications/ folder and run `sudo /Applications/CMake.app/Contents/bin/cmake-gui --install` to install CMake commandline tools.\"\n exit 1\nfi\n\nCMAKE_DIR=\"$TEMP_DIR/cmake\"\nrm -rf \"$CMAKE_DIR\"\n\nPLATFORM=\"SIMULATORARM64\"\nDEPLOYMENT_TARGET=\"17.0\"\n\nif [[ \"$PLATFORM_NAME\" == *\"iphoneos\"* ]]; then\n PLATFORM=\"OS64\"\nelif [[ \"$PLATFORM_NAME\" == *\"macos\"* ]]; then\n PLATFORM=\"MAC_ARM64\"\n DEPLOYMENT_TARGET=\"12.0\"\nfi\n\ncmake_build() {\n local src_dir=$1\n local target=$2\n shift 2\n local extra_args=(\"$@\")\n local build_dir=\"$CMAKE_DIR/build/$(basename \"$src_dir\")\"\n\n mkdir -p \"$build_dir\" && cd \"$build_dir\"\n\n if [[ \"$PLATFORM\" == \"MAC_ARM64\" ]]; then\n extra_args+=(-DCMAKE_INSTALL_BUNDLEDIR=\"${CMAKE_DIR}/bin\")\n extra_args+=(-DCMAKE_MACOSX_BUNDLE=OFF)\n fi\n cmake -G Xcode \\\n -DCMAKE_BUILD_TYPE=\"Release\" \\\n -DCMAKE_CXX_STANDARD=17 \\\n -DCMAKE_TOOLCHAIN_FILE=\"$SRCROOT/../../../../third-party/ios-cmake/ios.toolchain.cmake\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD=\"c++17\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY=\"libc++\" \\\n -DPLATFORM=\"$PLATFORM\" \\\n -DDEPLOYMENT_TARGET=\"$DEPLOYMENT_TARGET\" \\\n -DCMAKE_INSTALL_PREFIX=\"$CMAKE_DIR\" \\\n \"${extra_args[@]}\" \\\n \"$src_dir\"\n cmake --build . --config \"Release\" --target \"$target\"\n if [[ \"$target\" == \"install\" ]]; then\n cmake --install . 
--prefix \"$CMAKE_DIR\"\n fi\n}\n\ncmake_build \"$SRCROOT/../../../../extension/llm/tokenizers/third-party/abseil-cpp\" \"install\" \\\n -DABSL_PROPAGATE_CXX_STD=ON\n\ncmake_build \"$SRCROOT/../../../../extension/llm/tokenizers/third-party/re2\" \"install\"\n\ncmake_build \"$SRCROOT/../../../../extension/llm/tokenizers/third-party/pcre2\" \"install\" \\\n -DPCRE2_BUILD_PCRE2_8=ON \\\n -DPCRE2_BUILD_PCRE2_16=OFF \\\n -DPCRE2_BUILD_PCRE2_32=OFF \\\n -DPCRE2_BUILD_TESTS=OFF \\\n -DPCRE2_BUILD_PCRE2GREP=OFF \\\n -DPCRE2_BUILD_PCRE2TEST=OFF \\\n -DPCRE2_BUILD_PCRE2GPERF=OFF \\\n -DPCRE2_BUILD_DOCS=OFF \\\n -DPCRE2_BUILD_LIBPCRE2_PDB=OFF\n\ncmake_build \"$SRCROOT/../../../../extension/llm/tokenizers/third-party/sentencepiece\" \"sentencepiece-static\" \\\n -DSPM_ENABLE_SHARED=OFF\n \ncmake_build \"$SRCROOT/../../../../extension/llm/tokenizers/third-party/llama.cpp-unicode\" \"install\"\n \n# Include the single header for json.\nmkdir -p \"$CMAKE_DIR/include/nlohmann\"\ncp \"$SRCROOT/../../../../extension/llm/tokenizers/third-party/json/single_include/nlohmann/json.hpp\" \"$CMAKE_DIR/include/nlohmann/json.hpp\"\n\necho \"$(find $CMAKE_DIR/lib -name \"*.a\" | sed -E 's|^.*/lib([^/]+)\\.a|-l\\1|g' | tr '\\n' ' ')\" > \"$CMAKE_DIR/linker_flags\"\n"; }; /* End PBXShellScriptBuildPhase section */ diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md index b16f27410af..47352607bca 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md @@ -36,7 +36,6 @@ Install dependencies ``` ./install_executorch.sh -./backends/apple/mps/install_requirements.sh ``` ## Prepare Models diff --git a/examples/demo-apps/react-native/rnllama/README.md b/examples/demo-apps/react-native/rnllama/README.md index f017c8bfa22..7729f7a153a 100644 --- a/examples/demo-apps/react-native/rnllama/README.md +++ 
b/examples/demo-apps/react-native/rnllama/README.md @@ -26,7 +26,7 @@ A React Native mobile application for running LLaMA language models using ExecuT 3. Pull submodules: `git submodule sync && git submodule update --init` -4. Install dependencies: `./install_executorch.sh --pybind xnnpack && ./examples/models/llama/install_requirements.sh` +4. Install dependencies: `./install_executorch.sh && ./examples/models/llama/install_requirements.sh` 5. Follow the instructions in the [README](https://github.com/pytorch/executorch/blob/main/examples/models/llama/README.md#option-a-download-and-export-llama32-1b3b-model) to export a model as `.pte` diff --git a/examples/demo-apps/react-native/rnllama/ios/rnllama.xcodeproj/project.pbxproj b/examples/demo-apps/react-native/rnllama/ios/rnllama.xcodeproj/project.pbxproj index 5d9d01cfff7..1a56daafaea 100644 --- a/examples/demo-apps/react-native/rnllama/ios/rnllama.xcodeproj/project.pbxproj +++ b/examples/demo-apps/react-native/rnllama/ios/rnllama.xcodeproj/project.pbxproj @@ -557,7 +557,7 @@ ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "set -e\n\nif ! command -v cmake &> /dev/null\nthen\n echo \"cmake not found, please install cmake. \\n1. Download Cmake.app from https://cmake.org/download with version > 3.19. \\n2. 
Install it to Applications/ folder and run sudo /Applications/CMake.app/Contents/bin/cmake-gui --install to install CMake commandline tools.\"\n exit 1\nfi\n\nCMAKE_DIR=\"$TEMP_DIR/cmake\"\nrm -rf \"$CMAKE_DIR\"\n\nPLATFORM=\"SIMULATORARM64\"\nDEPLOYMENT_TARGET=\"17.0\"\n\nif [[ \"$PLATFORM_NAME\" == *\"iphoneos\"* ]]; then\n PLATFORM=\"OS64\"\nelif [[ \"$PLATFORM_NAME\" == *\"macos\"* ]]; then\n PLATFORM=\"MAC_ARM64\"\n DEPLOYMENT_TARGET=\"10.15\"\nfi\n\ncmake_build() {\n local src_dir=$1\n shift\n local extra_args=(\"$@\")\n local build_dir=\"$CMAKE_DIR/build/$(basename \"$src_dir\")\"\n\n mkdir -p \"$build_dir\" && cd \"$build_dir\"\n cmake -G Xcode \\\n -DCMAKE_BUILD_TYPE=\"Release\" \\\n -DCMAKE_CXX_STANDARD=17 \\\n -DCMAKE_TOOLCHAIN_FILE=\"$PROJECT_DIR/../../../../../third-party/ios-cmake/ios.toolchain.cmake\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD=\"c++17\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY=\"libc++\" \\\n -DPLATFORM=\"$PLATFORM\" \\\n -DDEPLOYMENT_TARGET=\"$DEPLOYMENT_TARGET\" \\\n \"${extra_args[@]}\" \\\n \"$src_dir\"\n cmake --build . --config \"Release\"\n cmake --install . --prefix \"$CMAKE_DIR\"\n}\n\ncmake_build \"$PROJECT_DIR/../../../../../extension/llm/third-party/abseil-cpp\" \\\n -DABSL_PROPAGATE_CXX_STD=ON\n \ncmake_build \"$PROJECT_DIR/../../../../../extension/llm/third-party/re2\" \\\n -DCMAKE_PREFIX_PATH=\"$CMAKE_DIR/lib/cmake/absl\"\n \ncmake_build \"$PROJECT_DIR/../../../../../extension/llm/third-party/sentencepiece\" \\\n -DSPM_ENABLE_SHARED=OFF\n\necho \"$(find $CMAKE_DIR/lib -name \"*.a\" | sed -E 's|^.*/lib([^/]+)\\.a|-l\\1|g' | tr '\\n' ' ')\" > \"$CMAKE_DIR/linker_flags\"\n\n\n\n"; + shellScript = "set -e\n\nif ! command -v cmake &> /dev/null\nthen\n echo \"cmake not found, please install cmake. \\n1. Download Cmake.app from https://cmake.org/download with version > 3.19. \\n2. 
Install it to Applications/ folder and run sudo /Applications/CMake.app/Contents/bin/cmake-gui --install to install CMake commandline tools.\"\n exit 1\nfi\n\nCMAKE_DIR=\"$TEMP_DIR/cmake\"\nrm -rf \"$CMAKE_DIR\"\n\nPLATFORM=\"SIMULATORARM64\"\nDEPLOYMENT_TARGET=\"17.0\"\n\nif [[ \"$PLATFORM_NAME\" == *\"iphoneos\"* ]]; then\n PLATFORM=\"OS64\"\nelif [[ \"$PLATFORM_NAME\" == *\"macos\"* ]]; then\n PLATFORM=\"MAC_ARM64\"\n DEPLOYMENT_TARGET=\"12.0\"\nfi\n\ncmake_build() {\n local src_dir=$1\n shift\n local extra_args=(\"$@\")\n local build_dir=\"$CMAKE_DIR/build/$(basename \"$src_dir\")\"\n\n mkdir -p \"$build_dir\" && cd \"$build_dir\"\n cmake -G Xcode \\\n -DCMAKE_BUILD_TYPE=\"Release\" \\\n -DCMAKE_CXX_STANDARD=17 \\\n -DCMAKE_TOOLCHAIN_FILE=\"$PROJECT_DIR/../../../../../third-party/ios-cmake/ios.toolchain.cmake\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD=\"c++17\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY=\"libc++\" \\\n -DPLATFORM=\"$PLATFORM\" \\\n -DDEPLOYMENT_TARGET=\"$DEPLOYMENT_TARGET\" \\\n \"${extra_args[@]}\" \\\n \"$src_dir\"\n cmake --build . --config \"Release\"\n cmake --install . 
--prefix \"$CMAKE_DIR\"\n}\n\ncmake_build \"$PROJECT_DIR/../../../../../extension/llm/third-party/abseil-cpp\" \\\n -DABSL_PROPAGATE_CXX_STD=ON\n \ncmake_build \"$PROJECT_DIR/../../../../../extension/llm/third-party/re2\" \\\n -DCMAKE_PREFIX_PATH=\"$CMAKE_DIR/lib/cmake/absl\"\n \ncmake_build \"$PROJECT_DIR/../../../../../extension/llm/third-party/sentencepiece\" \\\n -DSPM_ENABLE_SHARED=OFF\n\necho \"$(find $CMAKE_DIR/lib -name \"*.a\" | sed -E 's|^.*/lib([^/]+)\\.a|-l\\1|g' | tr '\\n' ' ')\" > \"$CMAKE_DIR/linker_flags\"\n\n\n\n"; }; F7CCCCE770493310D0125117 /* [Expo] Configure project */ = { isa = PBXShellScriptBuildPhase; diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index 041c7bb1d97..52d7baeabbf 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -148,7 +148,7 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus ## Step 1: Setup > :warning: **double check your python environment**: make sure `conda activate ` is run before all the bash and python scripts. -1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation run `./install_executorch.sh --pybind xnnpack` +1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation run `./install_executorch.sh` 2. Run `examples/models/llama/install_requirements.sh` to install a few dependencies. 
@@ -164,7 +164,7 @@ Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus ``` # No quantization # Set these paths to point to the downloaded files -LLAMA_CHECKPOINT=path/to/checkpoint.pth +LLAMA_CHECKPOINT=path/to/consolidated.00.pth LLAMA_PARAMS=path/to/params.json python -m examples.models.llama.export_llama \ @@ -186,7 +186,7 @@ For convenience, an [exported ExecuTorch bf16 model](https://huggingface.co/exec ``` # SpinQuant # Set these paths to point to the exported files -LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/checkpoint.pth +LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/consolidated.00.pth LLAMA_PARAMS=path/to/spinquant/params.json python -m examples.models.llama.export_llama \ @@ -215,7 +215,7 @@ For convenience, an [exported ExecuTorch SpinQuant model](https://huggingface.co ``` # QAT+LoRA # Set these paths to point to the exported files -LLAMA_QUANTIZED_CHECKPOINT=path/to/qlora/checkpoint.pth +LLAMA_QUANTIZED_CHECKPOINT=path/to/qlora/consolidated.00.pth LLAMA_PARAMS=path/to/qlora/params.json python -m examples.models.llama.export_llama \ @@ -248,7 +248,7 @@ You can export and run the original Llama 3 8B instruct model. 2. Export model and generate `.pte` file ``` python -m examples.models.llama.export_llama \ - --checkpoint \ + --checkpoint \ -p \ -kv \ --use_sdpa_with_kv_cache \ @@ -396,7 +396,7 @@ First export your model for lowbit quantization (step 2 above): ``` # Set these paths to point to the downloaded files -LLAMA_CHECKPOINT=path/to/checkpoint.pth +LLAMA_CHECKPOINT=path/to/consolidated.00.pth LLAMA_PARAMS=path/to/params.json # Set low-bit quantization parameters @@ -476,7 +476,7 @@ We use [LM Eval](https://github.com/EleutherAI/lm-evaluation-harness) to evaluat For base models, use the following example command to calculate its perplexity based on WikiText. 
``` python -m examples.models.llama.eval_llama \ - -c \ + -c \ -p \ -t \ -kv \ @@ -489,7 +489,7 @@ python -m examples.models.llama.eval_llama \ For instruct models, use the following example command to calculate its MMLU score. ``` python -m examples.models.llama.eval_llama \ - -c \ + -c \ -p \ -t \ -kv \ @@ -528,7 +528,7 @@ This example tries to reuse the Python code, with minimal modifications to make git clean -xfd pip uninstall executorch ./install_executorch.sh --clean -./install_executorch.sh --pybind xnnpack +./install_executorch.sh ``` - If you encounter `pthread` related issues during link time, add `pthread` in `target_link_libraries` in `CMakeLists.txt` - On Mac, if there is linking error in Step 4 with error message like diff --git a/examples/models/llama/source_transformation/quantize.py b/examples/models/llama/source_transformation/quantize.py index d2e2d5396d3..19fef857865 100644 --- a/examples/models/llama/source_transformation/quantize.py +++ b/examples/models/llama/source_transformation/quantize.py @@ -8,7 +8,7 @@ import re from functools import partial from pathlib import Path -from typing import Any, Dict, Optional +from typing import Dict, Optional import torch import torch.nn as nn @@ -16,8 +16,6 @@ from executorch.extension.llm.export.builder import DType -from sentencepiece import SentencePieceProcessor - try: from fairseq2.nn.embedding import ( @@ -57,7 +55,7 @@ def quantize( # noqa C901 Args: model: The model to quantize. - qmode: The quantization mode, e.g. int8, 8da4w, 8da4w-gptq. + qmode: The quantization mode, e.g. int8, 8da4w. computation_dtype: The dtype that ops are performed in (the resulting dtype of dequantization). Also the dtype of the rest of the non-quantized compoents of the model. 
checkpoint_dtype: The dtype of the checkpoint, this arg exists since it is more accurate to @@ -161,58 +159,6 @@ def quantize( # noqa C901 if verbose: print("quantized model:", model) return model - elif qmode == "8da4w-gptq": - # Check for required args - required_args: Optional[Any] = [ - group_size, - calibration_limit, - calibration_seq_length, - ] - if any(arg is None for arg in required_args): - raise Exception( - "For 8da4w-gptq quantization, group size, calibration limit and calibration sequence length must be specified." - ) - if calibration_tasks is None: - calibration_tasks = ["wikitext"] - - try: - # torchao 0.3+ - from torchao._models._eval import InputRecorder - except ImportError: - from torchao.quantization.GPTQ import InputRecorder # pyre-ignore - - from torchao.quantization.quant_api import Int8DynActInt4WeightGPTQQuantizer - - if tokenizer_path is None: - assert checkpoint_path is not None, "checkpoint_path must be specified" - tokenizer_path = checkpoint_path.parent / "tokenizer.model" - assert tokenizer_path.is_file(), tokenizer_path - tokenizer = SentencePieceProcessor( # pyre-ignore[28] - model_file=str(tokenizer_path) - ) - - inputs = ( - InputRecorder( # pyre-fixme[16] - tokenizer, - calibration_seq_length, - None, # input_prep_func - pad_calibration_inputs, - model.vocab_size, - ) - .record_inputs( - calibration_tasks, - calibration_limit, - ) - .get_inputs() - ) - - gptq_quantizer = Int8DynActInt4WeightGPTQQuantizer( - blocksize, - percdamp, - group_size, - ) # TODO: separate computation and checkpoint dtype for GPTQ. 
- model = gptq_quantizer.quantize(model, inputs) - return model elif qmode == "vulkan_4w": from executorch.backends.vulkan._passes import VkInt4WeightOnlyQuantizer diff --git a/examples/models/phi-3-mini/README.md b/examples/models/phi-3-mini/README.md index f52f2a3a06d..3546ce7f1f2 100644 --- a/examples/models/phi-3-mini/README.md +++ b/examples/models/phi-3-mini/README.md @@ -3,7 +3,7 @@ This example demonstrates how to run a [Phi-3-mini](https://huggingface.co/micro # Instructions ## Step 1: Setup -1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation run `./install_executorch.sh --pybind xnnpack` +1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation run `./install_executorch.sh` 2. Currently, we support transformers v4.44.2. Install transformers with the following command: ``` pip uninstall -y transformers ; pip install transformers==4.44.2 diff --git a/examples/models/qwen3/README.md b/examples/models/qwen3/README.md index 65923fb020c..a589d27c19d 100644 --- a/examples/models/qwen3/README.md +++ b/examples/models/qwen3/README.md @@ -88,4 +88,4 @@ cmake-out/examples/models/llama/llama_main To run the model on an example iOS or Android app, see the Llama README's [Step 5: Build Mobile apps](../llama/README.md#step-5-build-mobile-apps) section. ### FAQ -For more help with exporting or running this model, feel free to ask in our [discord channel](https://lnkd.in/gWCM4ViK). +For more help with exporting or running this model, feel free to ask in our [discord channel](https://discord.gg/UEjkY9Zs). diff --git a/examples/qualcomm/oss_scripts/deit.py b/examples/qualcomm/oss_scripts/deit.py new file mode 100644 index 00000000000..5482a77a166 --- /dev/null +++ b/examples/qualcomm/oss_scripts/deit.py @@ -0,0 +1,148 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. 
+# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import getpass +import json +import os +from multiprocessing.connection import Client + +import numpy as np +from executorch.backends.qualcomm._passes.qnn_pass_manager import ( + get_capture_program_passes, +) +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype +from executorch.examples.qualcomm.utils import ( + build_executorch_binary, + get_imagenet_dataset, + make_output_dir, + parse_skip_delegation_node, + setup_common_args_and_variables, + SimpleADB, + topk_accuracy, +) +from transformers import AutoConfig, AutoModelForImageClassification + + +def get_instance(): + module = ( + AutoModelForImageClassification.from_pretrained( + "facebook/deit-base-distilled-patch16-224" + ) + .eval() + .to("cpu") + ) + + return module + + +def main(args): + skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) + + os.makedirs(args.artifact, exist_ok=True) + config = AutoConfig.from_pretrained("facebook/deit-base-distilled-patch16-224") + data_num = 100 + height = config.image_size + width = config.image_size + inputs, targets, input_list = get_imagenet_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + image_shape=(height, width), + crop_size=(height, width), + ) + + # Get the Deit model. 
+ model = get_instance() + pte_filename = "deit_qnn" + + # lower to QNN + passes_job = get_capture_program_passes() + build_executorch_binary( + model, + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + dataset=inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + quant_dtype=QuantDtype.use_8a8w, + passes_job=passes_job, + shared_buffer=args.shared_buffer, + ) + + if args.compile_only: + return + + workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/{pte_filename}" + pte_path = f"{args.artifact}/{pte_filename}.pte" + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=pte_path, + workspace=workspace, + device_id=args.device, + host_id=args.host, + soc_model=args.model, + ) + adb.push(inputs=inputs, input_list=input_list) + adb.execute() + + # collect output data + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + + adb.pull(output_path=args.artifact) + + # top-k analysis + predictions = [] + for i in range(data_num): + predictions.append( + np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32 + ) + ) + + k_val = [1, 5] + topk = [topk_accuracy(predictions, targets, k).item() for k in k_val] + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({f"top_{k}": topk[i] for i, k in enumerate(k_val)})) + else: + for i, k in enumerate(k_val): + print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts and output by this example. Default ./deit_qnn", + default="./deit_qnn", + type=str, + ) + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. 
--dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=True, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/efficientnet.py b/examples/qualcomm/oss_scripts/efficientnet.py new file mode 100644 index 00000000000..b11ad7abc47 --- /dev/null +++ b/examples/qualcomm/oss_scripts/efficientnet.py @@ -0,0 +1,145 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import json +import logging +import os +from multiprocessing.connection import Client + +import numpy as np + +import torch +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype +from executorch.examples.qualcomm.utils import ( + build_executorch_binary, + get_imagenet_dataset, + make_output_dir, + parse_skip_delegation_node, + setup_common_args_and_variables, + SimpleADB, + topk_accuracy, +) +from transformers import AutoModelForImageClassification + + +def main(args): + skip_node_id_set, skip_node_op_set = parse_skip_delegation_node(args) + + # ensure the working directory exist. + os.makedirs(args.artifact, exist_ok=True) + + if not args.compile_only and args.device is None: + raise RuntimeError( + "device serial is required if not compile only. " + "Please specify a device serial by -s/--device argument." + ) + + data_num = 100 + if args.ci: + inputs = [(torch.rand(1, 3, 224, 224),)] + logging.warning( + "This option is for CI to verify the export flow. It uses random input and will result in poor accuracy." 
+ ) + else: + inputs, targets, input_list = get_imagenet_dataset( + dataset_path=f"{args.dataset}", + data_size=data_num, + image_shape=(256, 256), + crop_size=224, + ) + + module = ( + AutoModelForImageClassification.from_pretrained("google/efficientnet-b0") + .eval() + .to("cpu") + ) + pte_filename = "efficientnet_qnn_q16" + build_executorch_binary( + module.eval(), + inputs[0], + args.model, + f"{args.artifact}/{pte_filename}", + inputs, + skip_node_id_set=skip_node_id_set, + skip_node_op_set=skip_node_op_set, + quant_dtype=QuantDtype.use_16a16w, + shared_buffer=args.shared_buffer, + ) + + if args.compile_only: + return + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=f"{args.artifact}/{pte_filename}.pte", + workspace=f"/data/local/tmp/executorch/{pte_filename}", + device_id=args.device, + host_id=args.host, + soc_model=args.model, + shared_buffer=args.shared_buffer, + ) + adb.push(inputs=inputs, input_list=input_list) + adb.execute() + + # collect output data + output_data_folder = f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + + adb.pull(output_path=args.artifact) + + # top-k analysis + predictions = [] + for i in range(data_num): + predictions.append( + np.fromfile( + os.path.join(output_data_folder, f"output_{i}_0.raw"), dtype=np.float32 + ) + ) + + k_val = [1, 5] + topk = [topk_accuracy(predictions, targets, k).item() for k in k_val] + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({f"top_{k}": topk[i] for i, k in enumerate(k_val)})) + else: + for i, k in enumerate(k_val): + print(f"top_{k}->{topk[i]}%") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-d", + "--dataset", + help=( + "path to the validation folder of ImageNet dataset. " + "e.g. 
--dataset imagenet-mini/val " + "for https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000)" + ), + type=str, + required=False, + ) + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. " + "Default ./efficientnet", + default="./efficientnet", + type=str, + ) + + args = parser.parse_args() + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp index d348878294a..bdc2019352e 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -152,8 +152,10 @@ Error Runner::load() { // Use attention mask length to retrieve AR length and context length // Cache len equals to context_len - ar_len - int32_t prompt_processor_ar_len, token_generator_ar_len, max_cache_len, - max_ar_len; + int32_t prompt_processor_ar_len = 0; + int32_t token_generator_ar_len = 0; + int32_t max_cache_len = 0; + int32_t max_ar_len = 0; // atten mask: [1, AR-N, CL] auto atten_mask_meta_token = method_meta->input_tensor_meta(1); token_generator_ar_len = atten_mask_meta_token->sizes()[1]; diff --git a/examples/xnnpack/README.md b/examples/xnnpack/README.md index 5c307d34717..6fe1f0488b2 100644 --- a/examples/xnnpack/README.md +++ b/examples/xnnpack/README.md @@ -24,7 +24,7 @@ The following command will produce a floating-point XNNPACK delegated model `mv2 python3 -m examples.xnnpack.aot_compiler --model_name="mv2" --delegate ``` -Once we have the model binary (pte) file, then let's run it with ExecuTorch runtime using the `xnn_executor_runner`. 
With cmake, you first configure your cmake with the following: +Once we have the model binary (pte) file, then let's run it with ExecuTorch runtime using the `executor_runner`. With cmake, you first configure your cmake with the following: ```bash # cd to the root of executorch repo @@ -56,7 +56,7 @@ cmake --build cmake-out -j9 --target install --config Release Now finally you should be able to run this model with the following command ```bash -./cmake-out/backends/xnnpack/xnn_executor_runner --model_path ./mv2_xnnpack_fp32.pte +./cmake-out/executor_runner --model_path ./mv2_xnnpack_fp32.pte ``` ## Quantization @@ -80,7 +80,7 @@ python3 -m examples.xnnpack.quantization.example --help ``` ## Running the XNNPACK Model with CMake -After exporting the XNNPACK Delegated model, we can now try running it with example inputs using CMake. We can build and use the xnn_executor_runner, which is a sample wrapper for the ExecuTorch Runtime and XNNPACK Backend. We first begin by configuring the CMake build like such: +After exporting the XNNPACK Delegated model, we can now try running it with example inputs using CMake. We can build and use the executor_runner, which is a sample wrapper for the ExecuTorch Runtime. The XNNPACK Backend is enabled via the compilation flag `-DEXECUTORCH_BUILD_XNNPACK=ON`. 
We first begin by configuring the CMake build like such: ```bash # cd to the root of executorch repo cd executorch @@ -107,9 +107,9 @@ Then you can build the runtime componenets with cmake --build cmake-out -j9 --target install --config Release ``` -Now you should be able to find the executable built at `./cmake-out/backends/xnnpack/xnn_executor_runner` you can run the executable with the model you generated as such +Now you should be able to find the executable built at `./cmake-out/executor_runner` you can run the executable with the model you generated as such ```bash -./cmake-out/backends/xnnpack/xnn_executor_runner --model_path=./mv2_quantized.pte +./cmake-out/executor_runner --model_path=./mv2_quantized.pte ``` ## Delegating a Quantized Model diff --git a/exir/passes/constant_prop_pass.py b/exir/passes/constant_prop_pass.py index a103568b9a9..fc93aa1b0ca 100644 --- a/exir/passes/constant_prop_pass.py +++ b/exir/passes/constant_prop_pass.py @@ -295,6 +295,37 @@ def create_constant_nodes_and_return_specs( return name_to_spec_dict +def _update_output_node_and_specs(exported_program: ExportedProgram) -> None: + """ + Update the output node and output specs in the exported program. + In case a constant node is used as output, we replace it with a clone of the constant node. 
+ """ + # Dict [node.name -> InputSpec] + updated_constant_placeholders = get_constant_placeholder_dict(exported_program) + output = exported_program.graph.find_nodes(op="output")[0] + output_nodes = cast(list[torch.fx.Node], list(output.args[0])) + output_specs = exported_program.graph_signature.output_specs + assert len(output_nodes) == len(output_specs) + + for i in range(len(output_specs)): + out_node = output_nodes[i] + if out_node not in updated_constant_placeholders: + continue + + with exported_program.graph.inserting_after(out_node): + new_node = exported_program.graph.call_function( + exir_ops.edge.aten.clone.default, (out_node,) + ) + assert "val" in out_node.meta + new_node.meta["val"] = out_node.meta["val"] + output_nodes[i] = new_node + + # Update the constant-propagated output node. + output_specs[i].arg = TensorArgument(name=output_nodes[i].name) + + output.args = (output_nodes,) + + def constant_prop_pass( exported_program: ExportedProgram, custom_skip_targets: Optional[set[EdgeOpOverload]] = None, @@ -341,12 +372,12 @@ def constant_prop_pass( # Generate new input spec. new_input_specs = [] - for node in exported_program.graph.nodes: - if node.op != "placeholder": - continue + for node in exported_program.graph.find_nodes(op="placeholder"): new_input_specs.append(name_to_spec_dict[node.name]) exported_program.graph_signature.input_specs = new_input_specs + _update_output_node_and_specs(exported_program) + # Cleanup the graph. 
exported_program.graph.eliminate_dead_code() exported_program.graph_module.recompile() diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py index da072b0f838..ca2b5ebdc35 100644 --- a/exir/tests/test_passes.py +++ b/exir/tests/test_passes.py @@ -1026,6 +1026,34 @@ def forward(self, x): "executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor" ).run(gm.code) + def test_constant_prop_for_output(self) -> None: + class Add(torch.nn.Module): + def forward(self) -> torch.Tensor: + return torch.add(torch.tensor(3), torch.tensor(5)) + + add = Add() + + edge = to_edge( + export(add, (), strict=True), + compile_config=EdgeCompileConfig(_skip_dim_order=False), + ) + # Check there is a lifted tensor followed by a to_copy node + FileCheck().check("c_lifted_tensor_0").check("c_lifted_tensor_1").run( + edge.exported_program().graph_module.code + ) + + edge._edge_programs["forward"] = constant_prop_pass( + edge.exported_program("forward") + ) + + # Check (c_lifted_tensor_*) nodes are all replaced by _prop_tensor_constant. 
+ FileCheck().check_not("c_lifted_tensor_").check("_prop_tensor_constant").run( + edge.exported_program().graph_module.code + ) + # Validate that the program successfully passes validation to executorch: + edge.exported_program()._validate() + edge.to_executorch() + def test_constant_prop_pass_for_add(self) -> None: class Add(torch.nn.Module): def forward(self, x: torch.Tensor) -> torch.Tensor: diff --git a/export/TARGETS b/export/TARGETS index ae8be8a5e98..bf1002a701e 100644 --- a/export/TARGETS +++ b/export/TARGETS @@ -12,6 +12,7 @@ python_library( "//executorch/exir/backend:backend_api", "//executorch/exir:pass_manager", "//executorch/devtools/backend_debug:delegation_info", + "//executorch/extension/export_util:export_util", ] ) diff --git a/export/export.py b/export/export.py index 593f9b91157..7dd6b239d0a 100644 --- a/export/export.py +++ b/export/export.py @@ -4,16 +4,19 @@ import torch from executorch.devtools.backend_debug import get_delegation_info from executorch.exir._warnings import experimental +from executorch.exir.backend.backend_api import validation_disabled from executorch.exir.program import ( EdgeProgramManager, ExecutorchProgramManager, to_edge_transform_and_lower, ) from executorch.exir.schema import Program +from executorch.extension.export_util.utils import save_pte_program from executorch.runtime import Runtime, Verification from tabulate import tabulate from torch import nn from torch.ao.quantization import allow_exported_model_train_eval +from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer from torch.export import ExportedProgram from torchao.quantization import quantize_ from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e @@ -145,15 +148,15 @@ def run( model, self._example_inputs_dict[method_name][0], dynamic_shapes=dynamic_shapes, + strict=True, ) # Apply pre-edge transform passes if available if self._pre_edge_transform_passes is not None: - 
self._exported_program[method_name] = ( - self._pre_edge_transform_passes( + for pre_edge_transform_pass in self._pre_edge_transform_passes: + self._exported_program[method_name] = pre_edge_transform_pass( self._exported_program[method_name] ) - ) def get_artifacts(self) -> Dict[str, ExportedProgram]: """ @@ -210,13 +213,14 @@ def run( self._constant_methods = transform_config.get("constant_methods", None) # Process inputs - self._edge_program_manager = to_edge_transform_and_lower( - self._exported_program, - partitioner=self._partitioners, - transform_passes=self._transform_passes, - constant_methods=self._constant_methods, - compile_config=self._compile_config, - ) + with validation_disabled(): + self._edge_program_manager = to_edge_transform_and_lower( + self._exported_program, + partitioner=self._partitioners, + transform_passes=self._transform_passes, + constant_methods=self._constant_methods, + compile_config=self._compile_config, + ) self._delegation_info = get_delegation_info( self._edge_program_manager.exported_program().graph_module ) @@ -345,8 +349,8 @@ class QuantizeStage(Stage): Optional stage: Perform post-training quantization on the model. 
""" - def __init__(self, quantizer: Any) -> None: - self._quantizer = quantizer + def __init__(self, quantizers: Any) -> None: + self._quantizers = quantizers self._quantized_models: Dict[str, nn.Module] = {} self._model_dict: Dict[str, nn.Module] = {} self._exported_program_dict: Dict[str, ExportedProgram] = {} @@ -394,7 +398,8 @@ def run( model = exported_program.module() # Prepare the model for quantization - prepared_model = prepare_pt2e(model, self._quantizer) # type: ignore + composed_quantizer = ComposableQuantizer(self._quantizers) + prepared_model = prepare_pt2e(model, composed_quantizer) # type: ignore # Allow the model to switch between train and eval modes allow_exported_model_train_eval(prepared_model) @@ -546,9 +551,9 @@ def __init__( # Create the quantize stage if a quantizer is provided if self._export_recipe.quantization_recipe is not None: - quantizer = self._export_recipe.quantization_recipe.get_quantizer() - if quantizer is not None: - quantize_stage = QuantizeStage(quantizer=quantizer) + quantizers = self._export_recipe.quantization_recipe.get_quantizers() + if quantizers is not None: + quantize_stage = QuantizeStage(quantizers=quantizers) self._pipeline.append(quantize_stage) # Create the edge transform and lower stage @@ -661,6 +666,22 @@ def get_executorch_program(self) -> Program: ) return self._executorch_program_manager.executorch_program + def get_executorch_program_manager(self) -> ExecutorchProgramManager: + """ + Get the ExecutorchProgramManager. + + Returns: + The ExecutorchProgramManager + + Raises: + RuntimeError: If the executorch program manager is not initialized + """ + if self._executorch_program_manager is None: + raise RuntimeError( + "Executorch program manager is not initialized. Run export() first." + ) + return self._executorch_program_manager + def get_pte_buffer(self) -> bytes: """ Get the PTE buffer as bytes. 
@@ -677,6 +698,20 @@ def get_pte_buffer(self) -> bytes: ) return self._executorch_program_manager.buffer + def save_to_pte(self, output_name: str) -> None: + """ + Save the model to a .pte file. + + Args: + output_name (Optional[str]): The name of the .pte file. + """ + assert output_name, "Need a valid output name" + if self._executorch_program_manager is None: + raise RuntimeError( + "Executorch program manager is not initialized. Run export() first." + ) + save_pte_program(self._executorch_program_manager, output_name) + def get_example_input( self, method_name: str = "forward" ) -> Tuple[torch.Tensor, ...]: diff --git a/export/recipe.py b/export/recipe.py index 7b743c0aa4c..b993fce26e3 100644 --- a/export/recipe.py +++ b/export/recipe.py @@ -49,17 +49,17 @@ class QuantizationRecipe: quantizer: Optional quantizer for model quantization """ - quantizer: Optional[Quantizer] = None + quantizers: Optional[List[Quantizer]] = None ao_base_config: Optional[List[AOBaseConfig]] = None - def get_quantizer(self) -> Optional[Quantizer]: + def get_quantizers(self) -> Optional[Quantizer]: """ Get the quantizer associated with this recipe. 
Returns: The quantizer if one is set, otherwise None """ - return self.quantizer + return self.quantizers @experimental( @@ -94,10 +94,11 @@ class ExportRecipe: ) pre_edge_transform_passes: Optional[ Callable[[ExportedProgram], ExportedProgram] + | List[Callable[[ExportedProgram], ExportedProgram]] ] = None edge_transform_passes: Optional[Sequence[PassType]] = None transform_check_ir_validity: bool = True - partitioners: Optional[list[Partitioner]] = None + partitioners: Optional[List[Partitioner]] = None executorch_backend_config: Optional[ExecutorchBackendConfig] = ( None # pyre-ignore[11]: Type not defined ) diff --git a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmModuleInstrumentationTest.kt b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmModuleInstrumentationTest.kt index 43ce302a7a6..2df45f14985 100644 --- a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmModuleInstrumentationTest.kt +++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmModuleInstrumentationTest.kt @@ -18,10 +18,14 @@ import org.apache.commons.io.FileUtils import org.json.JSONException import org.json.JSONObject import org.junit.Assert +import org.junit.Assert.assertEquals +import org.junit.Assert.assertThat +import org.junit.Assert.assertTrue import org.junit.Before import org.junit.Rule import org.junit.Test import org.junit.runner.RunWith +import org.pytorch.executorch.TestFileUtils.getTestFilePath import org.pytorch.executorch.extension.llm.LlmCallback import org.pytorch.executorch.extension.llm.LlmModule @@ -30,7 +34,7 @@ import org.pytorch.executorch.extension.llm.LlmModule class LlmModuleInstrumentationTest : LlmCallback { private val results: MutableList = ArrayList() private val tokensPerSecond: MutableList = ArrayList() - private var llmModule: LlmModule? 
= null + private lateinit var llmModule: LlmModule @Before @Throws(IOException::class) @@ -57,25 +61,25 @@ class LlmModuleInstrumentationTest : LlmCallback { @Test @Throws(IOException::class, URISyntaxException::class) fun testGenerate() { - val loadResult = llmModule!!.load() + val loadResult = llmModule.load() // Check that the model can be load successfully - Assert.assertEquals(OK.toLong(), loadResult.toLong()) + assertEquals(OK.toLong(), loadResult.toLong()) - llmModule!!.generate(TEST_PROMPT, SEQ_LEN, this@LlmModuleInstrumentationTest) - Assert.assertEquals(results.size.toLong(), SEQ_LEN.toLong()) - Assert.assertTrue(tokensPerSecond[tokensPerSecond.size - 1] > 0) + llmModule.generate(TEST_PROMPT, SEQ_LEN, this@LlmModuleInstrumentationTest) + assertEquals(results.size.toLong(), SEQ_LEN.toLong()) + assertTrue(tokensPerSecond[tokensPerSecond.size - 1] > 0) } @Test @Throws(IOException::class, URISyntaxException::class) fun testGenerateAndStop() { - llmModule!!.generate( + llmModule.generate( TEST_PROMPT, SEQ_LEN, object : LlmCallback { override fun onResult(result: String) { this@LlmModuleInstrumentationTest.onResult(result) - llmModule!!.stop() + llmModule.stop() } override fun onStats(stats: String) { @@ -85,7 +89,7 @@ class LlmModuleInstrumentationTest : LlmCallback { ) val stoppedResultSize = results.size - Assert.assertTrue(stoppedResultSize < SEQ_LEN) + assertTrue(stoppedResultSize < SEQ_LEN) } override fun onResult(result: String) { @@ -101,7 +105,8 @@ class LlmModuleInstrumentationTest : LlmCallback { val promptEvalEndMs = jsonObject.getInt("prompt_eval_end_ms") tps = numGeneratedTokens.toFloat() / (inferenceEndMs - promptEvalEndMs) * 1000 tokensPerSecond.add(tps) - } catch (_: JSONException) {} + } catch (_: JSONException) { + } } companion object { @@ -110,12 +115,5 @@ class LlmModuleInstrumentationTest : LlmCallback { private const val TEST_PROMPT = "Hello" private const val OK = 0x00 private const val SEQ_LEN = 32 - - private fun 
getTestFilePath(fileName: String): String { - return InstrumentationRegistry.getInstrumentation() - .targetContext - .externalCacheDir - .toString() + fileName - } } } diff --git a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleE2ETest.kt b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleE2ETest.kt index 2a1e9d4c8ff..e269f4aa38f 100644 --- a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleE2ETest.kt +++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleE2ETest.kt @@ -22,6 +22,7 @@ import org.junit.Rule import org.junit.Test import org.junit.runner.RunWith import org.pytorch.executorch.TensorImageUtils.bitmapToFloat32Tensor +import org.pytorch.executorch.TestFileUtils.getTestFilePath /** Unit tests for [Module]. */ @RunWith(AndroidJUnit4::class) @@ -90,12 +91,6 @@ class ModuleE2ETest { } companion object { - private fun getTestFilePath(fileName: String): String { - return InstrumentationRegistry.getInstrumentation() - .targetContext - .externalCacheDir - .toString() + fileName - } fun argmax(array: FloatArray): Int { require(array.isNotEmpty()) { "Array cannot be empty" } diff --git a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleInstrumentationTest.kt b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleInstrumentationTest.kt index 1885660d0a1..58e9cc8bfef 100644 --- a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleInstrumentationTest.kt +++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleInstrumentationTest.kt @@ -23,6 +23,7 @@ import org.junit.Before import org.junit.Rule import org.junit.Test import org.junit.runner.RunWith +import org.pytorch.executorch.TestFileUtils.getTestFilePath /** Unit tests for [Module]. 
*/ @RunWith(AndroidJUnit4::class) @@ -173,12 +174,5 @@ class ModuleInstrumentationTest { private const val INVALID_STATE = 0x2 private const val INVALID_ARGUMENT = 0x12 private const val ACCESS_FAILED = 0x22 - - private fun getTestFilePath(fileName: String): String { - return InstrumentationRegistry.getInstrumentation() - .targetContext - .externalCacheDir - .toString() + fileName - } } } diff --git a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/TestFileUtils.kt b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/TestFileUtils.kt new file mode 100644 index 00000000000..efa364f8e94 --- /dev/null +++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/TestFileUtils.kt @@ -0,0 +1,16 @@ +package org.pytorch.executorch + +import androidx.test.InstrumentationRegistry + +/** + * Test File Utils + */ +object TestFileUtils { + + fun getTestFilePath(fileName: String): String { + return InstrumentationRegistry.getInstrumentation() + .targetContext + .externalCacheDir + .toString() + fileName + } +} diff --git a/extension/apple/ExecuTorch/Exported/ExecuTorch+Tensor.swift b/extension/apple/ExecuTorch/Exported/ExecuTorch+Tensor.swift index 29af8f78a5a..b325000ed23 100644 --- a/extension/apple/ExecuTorch/Exported/ExecuTorch+Tensor.swift +++ b/extension/apple/ExecuTorch/Exported/ExecuTorch+Tensor.swift @@ -54,7 +54,7 @@ public extension Tensor { func withUnsafeBytes(_ body: (UnsafeBufferPointer) throws -> R) throws -> R { guard dataType == T.dataType else { throw Error(code: .invalidArgument) } var result: Result? 
- bytes { pointer, count, _ in + __bytes { pointer, count, _ in result = Result { try body( UnsafeBufferPointer( start: pointer.assumingMemoryBound(to: T.self), @@ -74,7 +74,7 @@ public extension Tensor { func withUnsafeMutableBytes(_ body: (UnsafeMutableBufferPointer) throws -> R) throws -> R { guard dataType == T.dataType else { throw Error(code: .invalidArgument) } var result: Result? - mutableBytes { pointer, count, _ in + __mutableBytes { pointer, count, _ in result = Result { try body( UnsafeMutableBufferPointer( start: pointer.assumingMemoryBound(to: T.self), diff --git a/extension/apple/ExecuTorch/Exported/ExecuTorchTensor.h b/extension/apple/ExecuTorch/Exported/ExecuTorchTensor.h index e832845d6ba..5b130da56c9 100644 --- a/extension/apple/ExecuTorch/Exported/ExecuTorchTensor.h +++ b/extension/apple/ExecuTorch/Exported/ExecuTorchTensor.h @@ -176,7 +176,7 @@ __attribute__((deprecated("This API is experimental."))) * - and the data type. */ - (void)bytesWithHandler:(NS_NOESCAPE void (^)(const void *pointer, NSInteger count, ExecuTorchDataType dataType))handler - NS_SWIFT_NAME(bytes(_:)); + NS_REFINED_FOR_SWIFT; /** * Executes a block with a pointer to the tensor's mutable byte data. @@ -187,7 +187,7 @@ __attribute__((deprecated("This API is experimental."))) * - and the data type. */ - (void)mutableBytesWithHandler:(NS_NOESCAPE void (^)(void *pointer, NSInteger count, ExecuTorchDataType dataType))handler - NS_SWIFT_NAME(mutableBytes(_:)); + NS_REFINED_FOR_SWIFT; /** * Resizes the tensor to a new shape. 
diff --git a/extension/apple/ExecuTorch/__tests__/TensorTest.swift b/extension/apple/ExecuTorch/__tests__/TensorTest.swift index 689a514403f..052b84ae5f8 100644 --- a/extension/apple/ExecuTorch/__tests__/TensorTest.swift +++ b/extension/apple/ExecuTorch/__tests__/TensorTest.swift @@ -68,13 +68,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.dimensionOrder, [0, 1]) XCTAssertEqual(tensor.shapeDynamism, .dynamicBound) XCTAssertEqual(tensor.count, 6) - - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(dataType, .float) - XCTAssertEqual(count, 6) - XCTAssertEqual(size(ofDataType: dataType), 4) - XCTAssertEqual(Array(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Float.self), count: count)), data) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { buffer in + XCTAssertEqual(Array(buffer), data) + }) } func testInitBytes() { @@ -91,13 +87,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.dimensionOrder, [0, 1]) XCTAssertEqual(tensor.shapeDynamism, .dynamicBound) XCTAssertEqual(tensor.count, 6) - - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(dataType, .double) - XCTAssertEqual(count, 6) - XCTAssertEqual(size(ofDataType: dataType), 8) - XCTAssertEqual(Array(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Double.self), count: count)).map { $0 + 1 }, data) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { buffer in + XCTAssertEqual(Array(buffer).map { $0 + 1 }, data) + }) } func testInitData() { @@ -105,9 +97,9 @@ class TensorTest: XCTestCase { let data = Data(bytes: dataArray, count: dataArray.count * MemoryLayout.size) let tensor = Tensor(data: data, shape: [4], dataType: .float) XCTAssertEqual(tensor.count, 4) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(Array(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Float.self), count: count)), dataArray) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { buffer in + XCTAssertEqual(Array(buffer), dataArray) + }) } func 
testWithCustomStridesAndDimensionOrder() { @@ -123,10 +115,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, [1, 2]) XCTAssertEqual(tensor.dimensionOrder, [1, 0]) XCTAssertEqual(tensor.count, 4) - - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(Array(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Float.self), count: count)), data) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { buffer in + XCTAssertEqual(Array(buffer), data) + }) } func testMutableBytes() { @@ -134,41 +125,14 @@ class TensorTest: XCTestCase { let tensor = data.withUnsafeMutableBytes { Tensor(bytes: $0.baseAddress!, shape: [4], dataType: .int) } - tensor.mutableBytes { pointer, count, dataType in - XCTAssertEqual(dataType, .int) - let buffer = pointer.assumingMemoryBound(to: Int32.self) - for i in 0..) in + XCTAssertNoThrow(try tensor.withUnsafeMutableBytes { (buffer: UnsafeMutableBufferPointer) in for i in buffer.indices { buffer[i] *= 2 } - } - try tensor.withUnsafeBytes { buffer in + }) + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in XCTAssertEqual(Array(buffer), [2, 4, 6, 8]) - } + }) } func testInitWithTensor() { @@ -202,18 +166,17 @@ class TensorTest: XCTestCase { func testResize() { var data: [Int] = [1, 2, 3, 4] let tensor = data.withUnsafeMutableBytes { - Tensor(bytesNoCopy: $0.baseAddress!, shape: [4, 1], dataType: .int) + Tensor(bytesNoCopy: $0.baseAddress!, shape: [4, 1], dataType: .long) } XCTAssertNoThrow(try tensor.resize(to: [2, 2])) - XCTAssertEqual(tensor.dataType, .int) + XCTAssertEqual(tensor.dataType, .long) XCTAssertEqual(tensor.shape, [2, 2]) XCTAssertEqual(tensor.strides, [2, 1]) XCTAssertEqual(tensor.dimensionOrder, [0, 1]) XCTAssertEqual(tensor.count, 4) - - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(Array(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Int.self), count: count)), data) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { buffer in + 
XCTAssertEqual(Array(buffer), data) + }) } func testResizeError() { @@ -255,9 +218,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, [1]) XCTAssertEqual(tensor.dimensionOrder, [0]) XCTAssertEqual(tensor.count, 6) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(Array(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: UInt8.self), count: count)), data) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { buffer in + XCTAssertEqual(Array(buffer), data) + }) } func testInitScalarsInt8() { @@ -268,9 +231,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, [1]) XCTAssertEqual(tensor.dimensionOrder, [0]) XCTAssertEqual(tensor.count, 6) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(Array(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Int8.self), count: count)), data) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { buffer in + XCTAssertEqual(Array(buffer), data) + }) } func testInitScalarsInt16() { @@ -281,9 +244,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, [1]) XCTAssertEqual(tensor.dimensionOrder, [0]) XCTAssertEqual(tensor.count, 6) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(Array(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Int16.self), count: count)), data) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { buffer in + XCTAssertEqual(Array(buffer), data) + }) } func testInitScalarsInt32() { @@ -294,9 +257,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, [1]) XCTAssertEqual(tensor.dimensionOrder, [0]) XCTAssertEqual(tensor.count, 6) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(Array(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Int32.self), count: count)), data) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { buffer in + XCTAssertEqual(Array(buffer), data) + }) } func testInitScalarsInt64() { @@ -307,9 +270,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, [1]) 
XCTAssertEqual(tensor.dimensionOrder, [0]) XCTAssertEqual(tensor.count, 6) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(Array(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Int64.self), count: count)), data) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { buffer in + XCTAssertEqual(Array(buffer), data) + }) } func testInitScalarsFloat() { @@ -320,9 +283,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, [1]) XCTAssertEqual(tensor.dimensionOrder, [0]) XCTAssertEqual(tensor.count, 6) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(Array(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Float.self), count: count)), data) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { buffer in + XCTAssertEqual(Array(buffer), data) + }) } func testInitScalarsDouble() { @@ -333,9 +296,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, [1]) XCTAssertEqual(tensor.dimensionOrder, [0]) XCTAssertEqual(tensor.count, 6) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(Array(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Double.self), count: count)), data) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { buffer in + XCTAssertEqual(Array(buffer), data) + }) } func testInitScalarsBool() { @@ -346,9 +309,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, [1]) XCTAssertEqual(tensor.dimensionOrder, [0]) XCTAssertEqual(tensor.count, 6) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(Array(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Bool.self), count: count)), data) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { buffer in + XCTAssertEqual(Array(buffer), data) + }) } func testInitScalarsUInt16() { @@ -359,9 +322,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, [1]) XCTAssertEqual(tensor.dimensionOrder, [0]) XCTAssertEqual(tensor.count, 6) - tensor.bytes { pointer, count, dataType in - 
XCTAssertEqual(Array(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: UInt16.self), count: count)), data) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { buffer in + XCTAssertEqual(Array(buffer), data) + }) } func testInitScalarsUInt32() { @@ -372,9 +335,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, [1]) XCTAssertEqual(tensor.dimensionOrder, [0]) XCTAssertEqual(tensor.count, 6) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(Array(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: UInt32.self), count: count)), data) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { buffer in + XCTAssertEqual(Array(buffer), data) + }) } func testInitScalarsUInt64() { @@ -385,9 +348,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, [1]) XCTAssertEqual(tensor.dimensionOrder, [0]) XCTAssertEqual(tensor.count, 6) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(Array(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: UInt64.self), count: count)), data) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { buffer in + XCTAssertEqual(Array(buffer), data) + }) } func testInitScalarsInt() { @@ -398,9 +361,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, [1]) XCTAssertEqual(tensor.dimensionOrder, [0]) XCTAssertEqual(tensor.count, 6) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(Array(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Int.self), count: count)), data) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { buffer in + XCTAssertEqual(Array(buffer), data) + }) } func testInitScalarsUInt() { @@ -411,9 +374,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, [1]) XCTAssertEqual(tensor.dimensionOrder, [0]) XCTAssertEqual(tensor.count, 6) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(Array(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: UInt.self), count: count)), data) - } + XCTAssertNoThrow(try 
tensor.withUnsafeBytes { buffer in + XCTAssertEqual(Array(buffer), data) + }) } func testInitInt8() { @@ -423,9 +386,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, []) XCTAssertEqual(tensor.dimensionOrder, []) XCTAssertEqual(tensor.count, 1) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Int8.self), count: count).first, 42) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in + XCTAssertEqual(Array(buffer).first, 42) + }) } func testInitInt16() { @@ -435,9 +398,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, []) XCTAssertEqual(tensor.dimensionOrder, []) XCTAssertEqual(tensor.count, 1) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Int16.self), count: count).first, 42) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in + XCTAssertEqual(Array(buffer).first, 42) + }) } func testInitInt32() { @@ -447,9 +410,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, []) XCTAssertEqual(tensor.dimensionOrder, []) XCTAssertEqual(tensor.count, 1) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Int32.self), count: count).first, 42) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in + XCTAssertEqual(Array(buffer).first, 42) + }) } func testInitInt64() { @@ -459,9 +422,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, []) XCTAssertEqual(tensor.dimensionOrder, []) XCTAssertEqual(tensor.count, 1) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Int64.self), count: count).first, 42) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in + XCTAssertEqual(Array(buffer).first, 42) + }) } func 
testInitUInt8() { @@ -471,9 +434,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, []) XCTAssertEqual(tensor.dimensionOrder, []) XCTAssertEqual(tensor.count, 1) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: UInt8.self), count: count).first, 42) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in + XCTAssertEqual(Array(buffer).first, 42) + }) } func testInitUInt16() { @@ -483,9 +446,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, []) XCTAssertEqual(tensor.dimensionOrder, []) XCTAssertEqual(tensor.count, 1) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: UInt16.self), count: count).first, 42) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in + XCTAssertEqual(Array(buffer).first, 42) + }) } func testInitUInt32() { @@ -495,9 +458,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, []) XCTAssertEqual(tensor.dimensionOrder, []) XCTAssertEqual(tensor.count, 1) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: UInt32.self), count: count).first, 42) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in + XCTAssertEqual(Array(buffer).first, 42) + }) } func testInitUInt64() { @@ -507,9 +470,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, []) XCTAssertEqual(tensor.dimensionOrder, []) XCTAssertEqual(tensor.count, 1) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: UInt64.self), count: count).first, 42) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in + XCTAssertEqual(Array(buffer).first, 42) + }) } func testInitBool() { @@ -519,9 +482,9 @@ class TensorTest: XCTestCase { 
XCTAssertEqual(tensor.strides, []) XCTAssertEqual(tensor.dimensionOrder, []) XCTAssertEqual(tensor.count, 1) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Bool.self), count: count).first, true) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in + XCTAssertEqual(Array(buffer).first, true) + }) } func testInitFloat() { @@ -531,9 +494,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, []) XCTAssertEqual(tensor.dimensionOrder, []) XCTAssertEqual(tensor.count, 1) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Float.self), count: count).first, 42.0) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in + XCTAssertEqual(Array(buffer).first, 42) + }) } func testInitDouble() { @@ -543,9 +506,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, []) XCTAssertEqual(tensor.dimensionOrder, []) XCTAssertEqual(tensor.count, 1) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Double.self), count: count).first, 42.0) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in + XCTAssertEqual(Array(buffer).first, 42.0) + }) } func testInitInt() { @@ -555,9 +518,9 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, []) XCTAssertEqual(tensor.dimensionOrder, []) XCTAssertEqual(tensor.count, 1) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Int.self), count: count).first, 42) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { buffer in + XCTAssertEqual(Array(buffer).first, 42) + }) } func testInitUInt() { @@ -567,20 +530,20 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.strides, []) XCTAssertEqual(tensor.dimensionOrder, []) XCTAssertEqual(tensor.count, 1) - 
tensor.bytes { pointer, count, dataType in - XCTAssertEqual(UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: UInt.self), count: count).first, 42) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in + XCTAssertEqual(Array(buffer).first, 42) + }) } func testEmpty() { let tensor = Tensor.empty(shape: [3, 4], dataType: .float) XCTAssertEqual(tensor.shape, [3, 4]) XCTAssertEqual(tensor.count, 12) - tensor.bytes { pointer, count, dataType in - XCTAssertNotNil(pointer) - XCTAssertEqual(count, 12) - XCTAssertEqual(dataType, .float) - } + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in + XCTAssertNotNil(buffer.baseAddress) + XCTAssertEqual(buffer.count, 12) + XCTAssertEqual(tensor.dataType, .float) + }) } func testEmptyLike() { @@ -596,87 +559,76 @@ class TensorTest: XCTestCase { let tensor = Tensor.full(shape: [2, 2], scalar: 7, dataType: .int) XCTAssertEqual(tensor.shape, [2, 2]) XCTAssertEqual(tensor.count, 4) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(dataType, .int) - let buffer = UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Int32.self), count: count) + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in for value in buffer { XCTAssertEqual(value, 7) } - } + }) } func testFullLike() { let other = Tensor.empty(shape: [2, 2], dataType: .int) let tensor = Tensor.full(like: other, scalar: 42, dataType: .float) XCTAssertEqual(tensor.shape, other.shape) - tensor.bytes { pointer, count, dataType in - let buffer = UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Float.self), count: count) + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in for value in buffer { XCTAssertEqual(value, 42.0) } - } + }) } func testOnes() { let tensor = Tensor.ones(shape: [2, 3], dataType: .float) XCTAssertEqual(tensor.shape, [2, 3]) XCTAssertEqual(tensor.count, 6) - tensor.bytes { pointer, count, dataType in - 
XCTAssertEqual(dataType, .float) - let buffer = UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Float.self), count: count) + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in for value in buffer { XCTAssertEqual(value, 1.0) } - } + }) } func testOnesLike() { let other = Tensor.empty(shape: [2, 4], dataType: .double) let tensor = Tensor.ones(like: other) XCTAssertEqual(tensor.shape, other.shape) - tensor.bytes { pointer, count, dataType in - let buffer = UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Double.self), count: count) + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in for value in buffer { XCTAssertEqual(value, 1.0) } - } + }) } func testZeros() { let tensor = Tensor.zeros(shape: [2, 3], dataType: .double) XCTAssertEqual(tensor.shape, [2, 3]) XCTAssertEqual(tensor.count, 6) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(dataType, .double) - let buffer = UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Double.self), count: count) + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in for value in buffer { XCTAssertEqual(value, 0) } - } + }) } func testZerosLike() { let other = Tensor.full(shape: [3, 2], scalar: 9, dataType: .int) let tensor = Tensor.zeros(like: other) XCTAssertEqual(tensor.shape, other.shape) - tensor.bytes { pointer, count, dataType in - let buffer = UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Int32.self), count: count) + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in for value in buffer { XCTAssertEqual(value, 0) } - } + }) } func testRandom() { let tensor = Tensor.rand(shape: [3, 3], dataType: .float) XCTAssertEqual(tensor.shape, [3, 3]) XCTAssertEqual(tensor.count, 9) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(dataType, .float) - let buffer = UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Float.self), count: count) - let 
uniqueValues = Set(buffer.map { $0 }) + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in + let uniqueValues = Set(buffer) XCTAssertTrue(uniqueValues.count > 1) - } + }) } func testRandomLike() { @@ -686,15 +638,13 @@ class TensorTest: XCTestCase { XCTAssertEqual(tensor.count, other.count) } - func testRandomNormal() { + func testRandomNormal() { let tensor = Tensor.randn(shape: [4], dataType: .double) XCTAssertEqual(tensor.shape, [4]) XCTAssertEqual(tensor.count, 4) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(dataType, .double) - let buffer = UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Double.self), count: count) + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in XCTAssertEqual(buffer.count, 4) - } + }) } func testRandomNormalLike() { @@ -708,23 +658,20 @@ class TensorTest: XCTestCase { let tensor = Tensor.randint(low: 10, high: 20, shape: [5], dataType: .int) XCTAssertEqual(tensor.shape, [5]) XCTAssertEqual(tensor.count, 5) - tensor.bytes { pointer, count, dataType in - XCTAssertEqual(dataType, .int) - let buffer = UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Int32.self), count: count) + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in for value in buffer { XCTAssertTrue(value >= 10 && value < 20) } - } + }) } func testRandomIntegerLike() { let other = Tensor.ones(shape: [5], dataType: .int) let tensor = Tensor.randint(like: other, low: 100, high: 200) - tensor.bytes { pointer, count, dataType in - let buffer = UnsafeBufferPointer(start: pointer.assumingMemoryBound(to: Int32.self), count: count) + XCTAssertNoThrow(try tensor.withUnsafeBytes { (buffer: UnsafeBufferPointer) in for value in buffer { XCTAssertTrue(value >= 100 && value < 200) } - } + }) } } diff --git a/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj b/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj index 
355227eef63..b0cddfa808c 100644 --- a/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj +++ b/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj @@ -392,7 +392,7 @@ ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "set -e\n\nif ! command -v cmake &> /dev/null\nthen\n echo \"Cmake not found, please install Cmake. \\n1. Download Cmake.app from https://cmake.org/download with version > 3.19. \\n2. Install it to Applications/ folder and run sudo /Applications/CMake.app/Contents/bin/cmake-gui --install to install CMake commandline tools.\"\n exit 1\nfi\n\nCMAKE_DIR=\"$TEMP_DIR/cmake\"\nrm -rf \"$CMAKE_DIR\"\n\nPLATFORM=\"SIMULATORARM64\"\nDEPLOYMENT_TARGET=\"17.0\"\n\nif [[ \"$PLATFORM_NAME\" == *\"iphoneos\"* ]]; then\n PLATFORM=\"OS64\"\nelif [[ \"$PLATFORM_NAME\" == *\"macos\"* ]]; then\n PLATFORM=\"MAC_ARM64\"\n DEPLOYMENT_TARGET=\"10.15\"\nfi\n\ncmake_build() {\n local src_dir=$1\n local target=$2\n shift 2\n local extra_args=(\"$@\")\n local build_dir=\"$CMAKE_DIR/build/$(basename \"$src_dir\")\"\n\n mkdir -p \"$build_dir\" && cd \"$build_dir\"\n\n if [[ \"$PLATFORM\" == \"MAC_ARM64\" ]]; then\n extra_args+=(-DCMAKE_INSTALL_BUNDLEDIR=\"${CMAKE_DIR}/bin\")\n extra_args+=(-DCMAKE_MACOSX_BUNDLE=OFF)\n fi\n cmake -G Xcode \\\n -DCMAKE_BUILD_TYPE=\"Release\" \\\n -DCMAKE_CXX_STANDARD=17 \\\n -DCMAKE_TOOLCHAIN_FILE=\"$SRCROOT/../../../../third-party/ios-cmake/ios.toolchain.cmake\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD=\"c++17\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY=\"libc++\" \\\n -DPLATFORM=\"$PLATFORM\" \\\n -DDEPLOYMENT_TARGET=\"$DEPLOYMENT_TARGET\" \\\n -DCMAKE_INSTALL_PREFIX=\"$CMAKE_DIR\" \\\n \"${extra_args[@]}\" \\\n \"$src_dir\"\n cmake --build . --config \"Release\" --target \"$target\"\n if [[ \"$target\" == \"install\" ]]; then\n cmake --install . 
--prefix \"$CMAKE_DIR\"\n fi\n}\n\ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/abseil-cpp\" \"install\" \\\n -DABSL_PROPAGATE_CXX_STD=ON\n\ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/re2\" \"install\"\n\ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/pcre2\" \"install\" \\\n -DPCRE2_BUILD_PCRE2_8=ON \\\n -DPCRE2_BUILD_PCRE2_16=OFF \\\n -DPCRE2_BUILD_PCRE2_32=OFF \\\n -DPCRE2_BUILD_TESTS=OFF \\\n -DPCRE2_BUILD_PCRE2GREP=OFF \\\n -DPCRE2_BUILD_PCRE2TEST=OFF \\\n -DPCRE2_BUILD_PCRE2GPERF=OFF \\\n -DPCRE2_BUILD_DOCS=OFF \\\n -DPCRE2_BUILD_LIBPCRE2_PDB=OFF\n \ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/sentencepiece\" \"sentencepiece-static\" \\\n -DSPM_ENABLE_SHARED=OFF\n \ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/llama.cpp-unicode\" \"install\"\n \n# Include the single header for json.\nmkdir -p \"$CMAKE_DIR/include/nlohmann\"\ncp \"$SRCROOT/../../../llm/tokenizers/third-party/json/single_include/nlohmann/json.hpp\" \"$CMAKE_DIR/include/nlohmann/json.hpp\"\n\necho \"$(find $CMAKE_DIR/lib -name \"*.a\" | sed -E 's|^.*/lib([^/]+)\\.a|-l\\1|g' | tr '\\n' ' ')\" > \"$CMAKE_DIR/linker_flags\"\n"; + shellScript = "set -e\n\nif ! command -v cmake &> /dev/null\nthen\n echo \"Cmake not found, please install Cmake. \\n1. Download Cmake.app from https://cmake.org/download with version > 3.19. \\n2. 
Install it to Applications/ folder and run sudo /Applications/CMake.app/Contents/bin/cmake-gui --install to install CMake commandline tools.\"\n exit 1\nfi\n\nCMAKE_DIR=\"$TEMP_DIR/cmake\"\nrm -rf \"$CMAKE_DIR\"\n\nPLATFORM=\"SIMULATORARM64\"\nDEPLOYMENT_TARGET=\"17.0\"\n\nif [[ \"$PLATFORM_NAME\" == *\"iphoneos\"* ]]; then\n PLATFORM=\"OS64\"\nelif [[ \"$PLATFORM_NAME\" == *\"macos\"* ]]; then\n PLATFORM=\"MAC_ARM64\"\n DEPLOYMENT_TARGET=\"12.0\"\nfi\n\ncmake_build() {\n local src_dir=$1\n local target=$2\n shift 2\n local extra_args=(\"$@\")\n local build_dir=\"$CMAKE_DIR/build/$(basename \"$src_dir\")\"\n\n mkdir -p \"$build_dir\" && cd \"$build_dir\"\n\n if [[ \"$PLATFORM\" == \"MAC_ARM64\" ]]; then\n extra_args+=(-DCMAKE_INSTALL_BUNDLEDIR=\"${CMAKE_DIR}/bin\")\n extra_args+=(-DCMAKE_MACOSX_BUNDLE=OFF)\n fi\n cmake -G Xcode \\\n -DCMAKE_BUILD_TYPE=\"Release\" \\\n -DCMAKE_CXX_STANDARD=17 \\\n -DCMAKE_TOOLCHAIN_FILE=\"$SRCROOT/../../../../third-party/ios-cmake/ios.toolchain.cmake\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD=\"c++17\" \\\n -DCMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY=\"libc++\" \\\n -DPLATFORM=\"$PLATFORM\" \\\n -DDEPLOYMENT_TARGET=\"$DEPLOYMENT_TARGET\" \\\n -DCMAKE_INSTALL_PREFIX=\"$CMAKE_DIR\" \\\n \"${extra_args[@]}\" \\\n \"$src_dir\"\n cmake --build . --config \"Release\" --target \"$target\"\n if [[ \"$target\" == \"install\" ]]; then\n cmake --install . 
--prefix \"$CMAKE_DIR\"\n fi\n}\n\ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/abseil-cpp\" \"install\" \\\n -DABSL_PROPAGATE_CXX_STD=ON\n\ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/re2\" \"install\"\n\ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/pcre2\" \"install\" \\\n -DPCRE2_BUILD_PCRE2_8=ON \\\n -DPCRE2_BUILD_PCRE2_16=OFF \\\n -DPCRE2_BUILD_PCRE2_32=OFF \\\n -DPCRE2_BUILD_TESTS=OFF \\\n -DPCRE2_BUILD_PCRE2GREP=OFF \\\n -DPCRE2_BUILD_PCRE2TEST=OFF \\\n -DPCRE2_BUILD_PCRE2GPERF=OFF \\\n -DPCRE2_BUILD_DOCS=OFF \\\n -DPCRE2_BUILD_LIBPCRE2_PDB=OFF\n \ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/sentencepiece\" \"sentencepiece-static\" \\\n -DSPM_ENABLE_SHARED=OFF\n \ncmake_build \"$SRCROOT/../../../llm/tokenizers/third-party/llama.cpp-unicode\" \"install\"\n \n# Include the single header for json.\nmkdir -p \"$CMAKE_DIR/include/nlohmann\"\ncp \"$SRCROOT/../../../llm/tokenizers/third-party/json/single_include/nlohmann/json.hpp\" \"$CMAKE_DIR/include/nlohmann/json.hpp\"\n\necho \"$(find $CMAKE_DIR/lib -name \"*.a\" | sed -E 's|^.*/lib([^/]+)\\.a|-l\\1|g' | tr '\\n' ' ')\" > \"$CMAKE_DIR/linker_flags\"\n"; }; /* End PBXShellScriptBuildPhase section */ @@ -632,7 +632,7 @@ DEVELOPMENT_TEAM = ""; GENERATE_INFOPLIST_FILE = YES; IPHONEOS_DEPLOYMENT_TARGET = 17.0; - MACOSX_DEPLOYMENT_TARGET = 10.15; + MACOSX_DEPLOYMENT_TARGET = 12.0; MARKETING_VERSION = 1.0; OTHER_CODE_SIGN_FLAGS = "--deep"; PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.BenchmarkTests; @@ -659,7 +659,7 @@ DEVELOPMENT_TEAM = ""; GENERATE_INFOPLIST_FILE = YES; IPHONEOS_DEPLOYMENT_TARGET = 17.0; - MACOSX_DEPLOYMENT_TARGET = 10.15; + MACOSX_DEPLOYMENT_TARGET = 12.0; MARKETING_VERSION = 1.0; OTHER_CODE_SIGN_FLAGS = "--deep"; PRODUCT_BUNDLE_IDENTIFIER = org.pytorch.executorch.BenchmarkTests; diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers index 57eb76d71d6..fc5962cd9e0 160000 --- a/extension/llm/tokenizers +++ 
b/extension/llm/tokenizers @@ -1 +1 @@ -Subproject commit 57eb76d71d6dde5396520c7d35142eb868994e06 +Subproject commit fc5962cd9e08019c5df6667eba3377e7d76441f7 diff --git a/extension/pybindings/README.md b/extension/pybindings/README.md index 8675993264d..2cd680e7bb9 100644 --- a/extension/pybindings/README.md +++ b/extension/pybindings/README.md @@ -2,28 +2,18 @@ This Python module, named `portable_lib`, provides a set of functions and classes for loading and executing bundled programs. To install it, run the fullowing command: ```bash -CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON" pip install . --no-build-isolation -``` - -Or when installing the rest of dependencies: +./install_executorch.sh -```bash -install_executorch.sh --pybind +# ...or use pip directly +pip install . --no-build-isolation ``` # Link Backends -You can link the runtime against some backends to make sure a delegated or partitioned model can still run by Python module successfully: - -```bash -CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_MPS=ON" \ - pip install . --no-build-isolation -``` - -Similarly, when installing the rest of dependencies: +Not all backends are built into the pip wheel by default. You can link these missing/experimental backends by turning on the corresponding cmake flag. 
For example, to include the MPS backend: ```bash -install_executorch.sh --pybind xnnpack coreml mps +CMAKE_ARGS="-DEXECUTORCH_BUILD_MPS=ON" ./install_executorch.sh ``` ## Functions diff --git a/extension/threadpool/cpuinfo_utils.cpp b/extension/threadpool/cpuinfo_utils.cpp index 21862fbd4aa..599527a885e 100644 --- a/extension/threadpool/cpuinfo_utils.cpp +++ b/extension/threadpool/cpuinfo_utils.cpp @@ -16,6 +16,10 @@ #include +#if defined(__APPLE__) && defined(__aarch64__) +#include +#endif + namespace executorch::extension::cpuinfo { // Ignore revisions (last digit (4 LSBs)) @@ -33,6 +37,11 @@ bool is_non_performant_core(const struct cpuinfo_uarch_info* uarch_info) { case cpuinfo_uarch_cortex_a53: case cpuinfo_uarch_cortex_a510: case cpuinfo_uarch_icestorm: + case cpuinfo_uarch_blizzard: + case cpuinfo_uarch_sawtooth: + case cpuinfo_uarch_coll_sawtooth: + case cpuinfo_uarch_tupai_sawtooth: + case cpuinfo_uarch_tahiti_sawtooth: return true; // This can be so many other cores. // Need to update this to better account for slow cores @@ -167,6 +176,23 @@ uint32_t get_num_performant_cores() { // In one plua 12 while it has 2 little cores, the topology // reported in /sys/devices/system/cpu/cpu* /topology/core_siblings_list // report wrong topology which results in wront configratuon +#if defined(__aarch64__) && defined(__APPLE__) + // Copied from ATen/ParallelCommon.cpp + // On Apple Silicon there are efficient and performance core + // Restrict parallel algorithms to performance cores by default + int32_t num_cores = -1; + size_t num_cores_len = sizeof(num_cores); + if (sysctlbyname( + "hw.perflevel0.physicalcpu", + &num_cores, + &num_cores_len, + nullptr, + 0) == 0) { + if (num_cores > 1) { + return static_cast(num_cores); + } + } +#endif return _get_num_performant_cores(); } } diff --git a/install_executorch.py b/install_executorch.py index 4c7b51ef239..b46c9808ba6 100644 --- a/install_executorch.py +++ b/install_executorch.py @@ -8,14 +8,12 @@ import argparse 
import glob -import itertools import logging import os import shutil import subprocess import sys from contextlib import contextmanager -from typing import List, Tuple from install_requirements import ( install_requirements, @@ -52,10 +50,6 @@ def clean(): print("Done cleaning build artifacts.") -# Please keep this insync with `ShouldBuild.pybindings` in setup.py. -VALID_PYBINDS = ["coreml", "mps", "xnnpack", "training", "openvino"] - - ################################################################################ # Git submodules ################################################################################ @@ -139,14 +133,9 @@ def check_folder(folder: str, file: str) -> bool: logger.info("All required submodules are present.") -def build_args_parser() -> argparse.ArgumentParser: - # Parse options. - parser = argparse.ArgumentParser() - parser.add_argument( - "--pybind", - action="append", - nargs="+", - help="one or more of coreml/mps/xnnpack, or off", +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Install executorch in your Python environment." ) parser.add_argument( "--clean", @@ -166,83 +155,34 @@ def build_args_parser() -> argparse.ArgumentParser: "picked up without rebuilding the wheel. Extension libraries will be " "installed inside the source tree.", ) - return parser - - -# Returns (wants_off, wanted_pybindings) -def _list_pybind_defines(args) -> Tuple[bool, List[str]]: - if args.pybind is None: - return False, [] - - # Flatten list of lists. 
- args.pybind = list(itertools.chain(*args.pybind)) - if "off" in args.pybind: - if len(args.pybind) != 1: - raise Exception(f"Cannot combine `off` with other pybinds: {args.pybind}") - return True, [] - - cmake_args = [] - for pybind_arg in args.pybind: - if pybind_arg not in VALID_PYBINDS: - raise Exception( - f"Unrecognized pybind argument {pybind_arg}; valid options are: {', '.join(VALID_PYBINDS)}" - ) - if pybind_arg == "training": - cmake_args.append("-DEXECUTORCH_BUILD_EXTENSION_TRAINING=ON") - else: - cmake_args.append(f"-DEXECUTORCH_BUILD_{pybind_arg.upper()}=ON") - - return False, cmake_args + return parser.parse_args() def main(args): if not python_is_compatible(): sys.exit(1) - parser = build_args_parser() - args = parser.parse_args() - - cmake_args = [os.getenv("CMAKE_ARGS", "")] - use_pytorch_nightly = True - - wants_pybindings_off, pybind_defines = _list_pybind_defines(args) - if wants_pybindings_off: - cmake_args.append("-DEXECUTORCH_BUILD_PYBIND=OFF") - else: - cmake_args += pybind_defines + args = _parse_args() if args.clean: clean() return - if args.use_pt_pinned_commit: - # This option is used in CI to make sure that PyTorch build from the pinned commit - # is used instead of nightly. CI jobs wouldn't be able to catch regression from the - # latest PT commit otherwise - use_pytorch_nightly = False - + cmake_args = [os.getenv("CMAKE_ARGS", "")] # Use ClangCL on Windows. # ClangCL is an alias to Clang that configures it to work in an MSVC-compatible # mode. Using it on Windows to avoid compiler compatibility issues for MSVC. if os.name == "nt": cmake_args.append("-T ClangCL") - - # - # Install executorch pip package. This also makes `flatc` available on the path. - # The --extra-index-url may be necessary if pyproject.toml has a dependency on a - # pre-release or nightly version of a torch package. 
- # - - # Set environment variables os.environ["CMAKE_ARGS"] = " ".join(cmake_args) - # Check if the required submodules are present and update them if not check_and_update_submodules() - - install_requirements(use_pytorch_nightly) - - # Run the pip install command - subprocess.run( + # This option is used in CI to make sure that PyTorch build from the pinned commit + # is used instead of nightly. CI jobs wouldn't be able to catch regression from the + # latest PT commit otherwise + install_requirements(use_pytorch_nightly=not args.use_pt_pinned_commit) + os.execvp( + sys.executable, [ sys.executable, "-m", @@ -257,14 +197,10 @@ def main(args): "--extra-index-url", TORCH_NIGHTLY_URL, ], - check=True, ) if __name__ == "__main__": # Before doing anything, cd to the directory containing this script. os.chdir(os.path.dirname(os.path.abspath(__file__))) - if not python_is_compatible(): - sys.exit(1) - main(sys.argv[1:]) diff --git a/install_requirements.py b/install_requirements.py index 38188d08300..f60020dbbbf 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -71,7 +71,7 @@ def python_is_compatible(): # # NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt # by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/ -NIGHTLY_VERSION = "dev20250602" +NIGHTLY_VERSION = "dev20250601" def install_requirements(use_pytorch_nightly): diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index 48a8d3bc8ee..77bf9cd573b 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -315,6 +315,10 @@ - op: prod.out +- op: rand.out + +- op: randn.out + - op: reciprocal.out - op: relu.out diff --git a/kernels/optimized/cpu/op_linear.cpp b/kernels/optimized/cpu/op_linear.cpp index 210000b384d..d81bfd8643f 100644 --- a/kernels/optimized/cpu/op_linear.cpp +++ b/kernels/optimized/cpu/op_linear.cpp @@ -6,17 +6,69 @@ * LICENSE file in the root directory of 
this source tree. */ +#include + +#include + #include +#include +#include #include #include -#include - namespace torch { namespace executor { namespace native { -using Tensor = executorch::aten::Tensor; +namespace { +using ::executorch::aten::Tensor; +using ::executorch::cpublas::gemm; +using ::executorch::cpublas::TransposeType; +using ::executorch::runtime::toString; +using ::executorch::vec::map; +using ::executorch::vec::Vectorized; + +// Use vector store to initialize with scalar bias. +template +void initialize_scalar( + const ssize_t out_numel, + const scalar_t init, + scalar_t* out) { + using Vec = Vectorized; + + // Initialize a vector with the scalar initial value. + Vec init_vec(init); + + ssize_t d = 0; + for (; d < out_numel - (out_numel % Vec::size()); d += Vec::size()) { + // Vector-length store. + init_vec.store(out + d); + } + if (out_numel - d > 0) { + // Sub-vector-length store. + init_vec.store(out + d, static_cast(out_numel - d)); + } +} + +// Use std::memcpy to initialize with vector bias. +template +void initialize_to_vector( + const ssize_t n, + const ssize_t m, + const scalar_t* bias, + scalar_t* out) { + // Output is a n x m x scalar_t, while bias is m x scalar_t. + const size_t row_size = static_cast(m) * sizeof(scalar_t); + for (const auto col : c10::irange(n)) { + std::memcpy( + // Point to Column `col` of the output tensor. 
+ out + col * m, + bias, + row_size); + } +} + +} // namespace Tensor& opt_linear_out( RuntimeContext& ctx, @@ -24,12 +76,6 @@ Tensor& opt_linear_out( const Tensor& mat2, const optional& bias, Tensor& out) { - ET_KERNEL_CHECK_MSG( - ctx, - !bias.has_value(), - InvalidArgument, - out, - "bias not supported yet in linear"); ET_KERNEL_CHECK(ctx, check_linear_args(in, mat2, out), InvalidArgument, out); size_t output_ndim = 0; @@ -46,28 +92,74 @@ Tensor& opt_linear_out( return out; } - int flattened_input_dim = 1; + ssize_t n = 1; for (int ii = 0; ii < in.dim() - 1; ++ii) { - flattened_input_dim *= in.sizes()[ii]; + n *= in.sizes()[ii]; } + const ssize_t k = in.sizes()[in.dim() - 1]; + const ssize_t m = mat2.size(0); + + if (bias.has_value()) { + ET_KERNEL_CHECK_MSG( + ctx, + // Bias and output dtype must match. + bias->dtype() == out.dtype(), + InvalidArgument, + out, + "Bias has wrong dtype! Expected bias dtype to be the same as out dtype %s" + " but got %s", + toString(bias->dtype()), + toString(out.dtype())); + + ET_KERNEL_CHECK_MSG( + ctx, + // Either no bias or bias is a 1D tensor of size m or 1. + bias->dim() == 1 && (bias->size(0) == m || bias->size(0) == 1), + InvalidArgument, + out, + "Bias has wrong dimensionality! Expected 1-D tensor of size %d or empty," + " but got %d-D tensor with %d elements", + static_cast(m), + static_cast(bias->dim()), + static_cast(bias->numel())); + } + ET_SWITCH_REAL_TYPES_AND2( - Half, BFloat16, in.scalar_type(), ctx, "mm.out", CTYPE, [&]() { - size_t n = flattened_input_dim; - size_t k = in.sizes()[in.dim() - 1]; - size_t m = mat2.size(0); - - executorch::cpublas::gemm( - executorch::cpublas::TransposeType::Transpose, - executorch::cpublas::TransposeType::NoTranspose, + Half, BFloat16, out.scalar_type(), ctx, "linear.out", CTYPE, [&] { + // Fill output with bias if it is provided. + if (bias.has_value() && bias->numel() == 1) { + // Scalar version of initialization. 
+ initialize_scalar( + out.numel(), + *bias->const_data_ptr(), + out.mutable_data_ptr()); + } else if (bias.has_value()) { + // Assume bias is a 1D tensor of size m. + initialize_to_vector( + n, + m, + bias->const_data_ptr(), + out.mutable_data_ptr()); + } + + // Set beta to 1 if bias was applied so that GEMM adds to the pre-filled + // bias, otherwise beta remains 0 (i.e. the output is fully overwritten + // by GEMM). + const CTYPE beta = + bias.has_value() ? static_cast(1) : static_cast(0); + + gemm( + /*transa=*/TransposeType::Transpose, + /*transb=*/TransposeType::NoTranspose, m, n, k, - static_cast(1), + /*alpha=*/static_cast(1), mat2.const_data_ptr(), k, in.const_data_ptr(), k, - static_cast(0), + beta, out.mutable_data_ptr(), m); }); diff --git a/kernels/portable/cpu/op_rand.cpp b/kernels/portable/cpu/op_rand.cpp new file mode 100644 index 00000000000..ba9b160019e --- /dev/null +++ b/kernels/portable/cpu/op_rand.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include + +#include +#include + +#include + +namespace torch { +namespace executor { +namespace native { + +using executorch::aten::IntArrayRef; +using Tensor = executorch::aten::Tensor; +using ScalarType = executorch::aten::ScalarType; + +Tensor& +rand_out(KernelRuntimeContext& ctx, const IntArrayRef sizes, Tensor& out) { + (void)ctx; + + std::mt19937 gen((std::random_device())()); + std::uniform_real_distribution dist(0.0, 1.0); + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, sizes) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, "randn.out", CTYPE, [&] { + auto data_out = out.mutable_data_ptr(); + for (const auto i : c10::irange(out.numel())) { + data_out[i] = static_cast(dist(gen)); + } + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/op_randn.cpp b/kernels/portable/cpu/op_randn.cpp new file mode 100644 index 00000000000..a0732e7f177 --- /dev/null +++ b/kernels/portable/cpu/op_randn.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include + +#include +#include + +#include + +namespace torch { +namespace executor { +namespace native { + +using executorch::aten::IntArrayRef; +using Tensor = executorch::aten::Tensor; +using ScalarType = executorch::aten::ScalarType; + +Tensor& +randn_out(KernelRuntimeContext& ctx, const IntArrayRef sizes, Tensor& out) { + (void)ctx; + + std::mt19937 gen((std::random_device())()); + std::normal_distribution dist(0.0, 1.0); + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, sizes) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, "randn.out", CTYPE, [&] { + auto data_out = out.mutable_data_ptr(); + for (const auto i : c10::irange(out.numel())) { + data_out[i] = static_cast(dist(gen)); + } + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index ecd6a771646..feaee415f91 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -713,6 +713,18 @@ - arg_meta: null kernel_name: torch::executor::prod_out +- op: rand.out + kernels: + - arg_meta: null + kernel_name: torch::executor::rand_out + tags: nondeterministic_seeded + +- op: randn.out + kernels: + - arg_meta: null + kernel_name: torch::executor::randn_out + tags: nondeterministic_seeded + - op: reciprocal.out kernels: - arg_meta: null diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt index 6cd34773d14..4f174b5a652 100644 --- a/kernels/test/CMakeLists.txt +++ b/kernels/test/CMakeLists.txt @@ -197,6 +197,8 @@ set(all_test_sources "op_permute_copy_test.cpp" "op_pixel_shuffle_test.cpp" "op_prod_test.cpp" + "op_rand_test.cpp" + "op_randn_test.cpp" "op_reciprocal_test.cpp" "op_relu_test.cpp" "op_remainder_test.cpp" diff --git a/kernels/test/op_linear_test.cpp b/kernels/test/op_linear_test.cpp index d894c5a818a..0ad5790a550 100644 
--- a/kernels/test/op_linear_test.cpp +++ b/kernels/test/op_linear_test.cpp @@ -18,7 +18,8 @@ #include #include -using namespace ::testing; +namespace { + using executorch::aten::ArrayRef; using executorch::aten::Scalar; using executorch::aten::ScalarType; @@ -31,7 +32,15 @@ class OpLinearOutTest : public OperatorTest { return torch::executor::aten::linear_outf(context_, self, mat2, {}, out); } - template + Tensor& op_linear_out( + const Tensor& self, + const Tensor& mat2, + const Tensor& bias, + Tensor& out) { + return torch::executor::aten::linear_outf(context_, self, mat2, bias, out); + } + + template void test_dtype() { TensorFactory tf; @@ -43,16 +52,16 @@ class OpLinearOutTest : public OperatorTest { } } - // matmul gives 32 * 2 * 3 = 192 - Tensor x = tf.full({3, 32}, 2); - Tensor y = tf.full({5, 32}, 3); + // matmul gives 19 * 2 * 3 = 114 + Tensor x = tf.full({3, 19}, 2); + Tensor y = tf.full({5, 19}, 3); // Output shape should be (3, 5) Tensor out = tf.zeros({3, 5}); op_linear_out(x, y, out); - Tensor expected = tf.full({3, 5}, 192); + Tensor expected = tf.full({3, 5}, 114); EXPECT_TENSOR_EQ(out, expected); } @@ -88,6 +97,80 @@ TEST_F(OpLinearOutTest, AllDtypesSupported) { // for those types. } +TEST_F(OpLinearOutTest, BiasTest) { + TensorFactory tf; + + // Initialize input tensors. + constexpr int kReduceDim = 4; + constexpr int kDimX = 3, kDimY = 2; + constexpr int kValueX = 1; + constexpr int kValueY = 2; + constexpr int kValueBias0 = 4, kValueBias1 = 7; + const Tensor x = tf.full({kDimX, kReduceDim}, kValueX); + const Tensor y = tf.full({kDimY, kReduceDim}, kValueY); + const Tensor b = tf.make({kDimY}, {kValueBias0, kValueBias1}); + // Output matrix is also empty + Tensor out = tf.zeros({kDimX, kDimY}); + // Initialize expected tensor. 
+ constexpr int kValueExpected0 = kValueX * kValueY * kReduceDim + kValueBias0; + constexpr int kValueExpected1 = kValueX * kValueY * kReduceDim + kValueBias1; + // Check that the bias is added to the correct position in the output matrix. + const Tensor expected = tf.make( + {kDimX, kDimY}, + {kValueExpected0, + kValueExpected1, + kValueExpected0, + kValueExpected1, + kValueExpected0, + kValueExpected1}); + + EXPECT_TENSOR_EQ(op_linear_out(x, y, b, out), expected); +} + +TEST_F(OpLinearOutTest, BiasBroadcastTest) { + TensorFactory tf; + + // Initialize input tensors. + constexpr int kReduceDim = 4; + constexpr int kDimX = 3, kDimY = 5; + constexpr int kValueX = 1; + constexpr int kValueY = 2; + constexpr int kValueBias = 4; + const Tensor x = tf.full({kDimX, kReduceDim}, kValueX); + const Tensor y = tf.full({kDimY, kReduceDim}, kValueY); + const Tensor b = tf.full({1}, kValueBias); + // Output matrix is also empty + Tensor out = tf.zeros({kDimX, kDimY}); + // Initialize expected tensor. + constexpr int kValueExpected = kValueX * kValueY * kReduceDim + kValueBias; + const Tensor expected = tf.full({kDimX, kDimY}, kValueExpected); + + EXPECT_TENSOR_EQ(op_linear_out(x, y, b, out), expected); +} + +TEST_F(OpLinearOutTest, BiasDtypeMismatch) { + TensorFactory tf; + TensorFactory tf_bias; + + // Initialize input tensors. + constexpr int kReduceDim = 4; + constexpr int kDimX = 3, kDimY = 5; + constexpr int kValueX = 1; + constexpr int kValueY = 2; + constexpr int kValueBias = 4; + Tensor x = tf.full({kDimX, kReduceDim}, kValueX); + Tensor y = tf.full({kDimY, kReduceDim}, kValueY); + // Same size as output. + Tensor b = tf_bias.full({kDimY}, kValueBias); + // Output matrix is also empty + Tensor out = tf.zeros({kDimX, kDimY}); + // Initialize expected tensor. 
+ constexpr int kValueExpected = kValueX * kValueY * kReduceDim + kValueBias; + Tensor expected = tf.full({kDimX, kDimY}, kValueExpected); + + ET_EXPECT_KERNEL_FAILURE(context_, op_linear_out(x, y, b, out)); +} + TEST_F(OpLinearOutTest, EmptyInputWithEmptyOutTensorPasses) { TensorFactory tf; @@ -297,5 +380,4 @@ TEST_F(OpLinearOutTest, DynamicShapeUnbound) { Tensor ret = op_linear_out(x, y, out); EXPECT_TENSOR_CLOSE(out, expected_result); } - -// TODO: support and test bias +} // namespace diff --git a/kernels/test/op_rand_test.cpp b/kernels/test/op_rand_test.cpp new file mode 100644 index 00000000000..7450ed6a242 --- /dev/null +++ b/kernels/test/op_rand_test.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include // Declares the operator +#include +#include +#include +#include + +#include + +#include +#include + +using executorch::aten::IntArrayRef; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using torch::executor::testing::TensorFactory; + +class OpRandTest : public OperatorTest { + protected: + void op_rand_out(const IntArrayRef sizes, Tensor& out) { + torch::executor::aten::rand_outf(context_, sizes, out); + } + + template + void test_rand(std::vector& sizes) { + TensorFactory tf; + + // Tensor factory wants int32 sizes, op kernel wants int64. + std::vector sizes_i32; + std::transform( + sizes.begin(), + sizes.end(), + std::back_inserter(sizes_i32), + [](int64_t s) { return static_cast(s); }); + Tensor out = tf.zeros(sizes_i32); + + IntArrayRef sizes_ref(sizes.data(), sizes.size()); + op_rand_out(sizes_ref, out); + + // Check mean and standard deviation. To avoid flaky CI, test pretty + // loosely. 
+ auto out_data = out.const_data_ptr(); + double mean = + std::accumulate( + out_data, + out_data + out.numel(), + 0.0, + [](double acc, CTYPE n) { return acc + static_cast(n); }) / + out.numel(); + double var = std::accumulate( + out_data, + out_data + out.numel(), + 0.0, + [=](double acc, CTYPE n) { + return acc + std::pow(static_cast(n) - mean, 2); + }) / + out.numel(); + auto stdev = std::sqrt(var); + + // These are very rough thresholds. A better test implementation would + // probably do a proper statistical test to compare the generated empirical + // data to the reference distribution, but this should do. + + // Expected mean is 0.5 + EXPECT_NEAR(mean, 0.5, 5.0 / std::sqrt(out.numel())); + // Expected stdev is 1/sqrt(12) ~= 0.289 + EXPECT_NEAR(stdev, 1.0 / std::sqrt(12), 0.1); + EXPECT_GT(stdev, 0); + } +}; + +TEST_F(OpRandTest, SmokeTest) { + std::vector sizes = {2, 3, 4, 128}; + +#define TEST_ENTRY(ctype, dtype) test_rand(sizes); + ET_FORALL_FLOATHBF16_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +TEST_F(OpRandTest, Rank) { + std::vector sizes = {1024}; + + for (int64_t i = 0; i < 4; i++) { + sizes.push_back(i + 1); + test_rand(sizes); + } +} diff --git a/kernels/test/op_randn_test.cpp b/kernels/test/op_randn_test.cpp new file mode 100644 index 00000000000..41456584e91 --- /dev/null +++ b/kernels/test/op_randn_test.cpp @@ -0,0 +1,93 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include // Declares the operator +#include +#include +#include +#include +#include + +#include + +#include +#include + +using executorch::aten::IntArrayRef; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using torch::executor::testing::TensorFactory; + +class OpRandnTest : public OperatorTest { + protected: + void op_randn_out(const IntArrayRef sizes, Tensor& out) { + torch::executor::aten::randn_outf(context_, sizes, out); + } + + template + void test_randn(std::vector& sizes) { + TensorFactory tf; + + // Tensor factory wants int32 sizes, op kernel wants int64. + std::vector sizes_i32; + std::transform( + sizes.begin(), + sizes.end(), + std::back_inserter(sizes_i32), + [](int64_t s) { return static_cast(s); }); + Tensor out = tf.zeros(sizes_i32); + + IntArrayRef sizes_ref(sizes.data(), sizes.size()); + op_randn_out(sizes_ref, out); + + // Check mean and standard deviation. To avoid flaky CI, test pretty + // loosely. + auto out_data = out.const_data_ptr(); + double mean = + std::accumulate( + out_data, + out_data + out.numel(), + 0.0, + [](double acc, CTYPE n) { return acc + static_cast(n); }) / + out.numel(); + double var = std::accumulate( + out_data, + out_data + out.numel(), + 0.0, + [=](double acc, CTYPE n) { + return acc + std::pow(static_cast(n) - mean, 2); + }) / + out.numel(); + auto stdev = std::sqrt(var); + + // These are very rough thresholds. A better test implementation would + // probably do a proper statistical test to compare the generated empirical + // data to the reference distribution, but this should do. 
+ EXPECT_LE(std::abs(mean), 5.0 / std::sqrt(out.numel())); + EXPECT_LE(std::abs(stdev - 1.0), 0.1); + EXPECT_GT(stdev, 0); + } +}; + +TEST_F(OpRandnTest, SmokeTest) { + std::vector sizes = {2, 3, 4, 128}; + +#define TEST_ENTRY(ctype, dtype) test_randn(sizes); + ET_FORALL_FLOATHBF16_TYPES(TEST_ENTRY); +#undef TEST_ENTRY +} + +TEST_F(OpRandnTest, Rank) { + std::vector sizes = {1024}; + + for (int64_t i = 0; i < 4; i++) { + sizes.push_back(i + 1); + test_randn(sizes); + } +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index c1824674fd4..bde3b8632b0 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -285,6 +285,8 @@ def define_common_targets(): _common_op_test("op_pixel_unshuffle_test", ["aten", "portable"]) _common_op_test("op_pow_test", ["aten", "portable"]) _common_op_test("op_prod_test", ["aten", "portable"]) + _common_op_test("op_rand_test", ["aten", "portable"]) + _common_op_test("op_randn_test", ["aten", "portable"]) _common_op_test("op_reciprocal_test", ["aten", "portable"]) _common_op_test("op_relu_test", ["aten", "portable"]) _common_op_test("op_remainder_test", ["aten", "portable"]) diff --git a/pytest.ini b/pytest.ini index 4dd7f4353d2..557a307bdf2 100644 --- a/pytest.ini +++ b/pytest.ini @@ -47,6 +47,8 @@ addopts = --ignore=backends/xnnpack/test/ops/test_sdpa.py backends/xnnpack/test/passes backends/xnnpack/test/serialization + # backends/apple/coreml + backends/apple/coreml/test # extension/ extension/llm/modules/test extension/llm/export diff --git a/runtime/backend/backend_init_context.h b/runtime/backend/backend_init_context.h index 71c5182f401..5a4b70e0dbc 100644 --- a/runtime/backend/backend_init_context.h +++ b/runtime/backend/backend_init_context.h @@ -25,8 +25,14 @@ class BackendInitContext final { const char* method_name = nullptr, const NamedDataMap* named_data_map = nullptr) : runtime_allocator_(runtime_allocator), +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_(event_tracer), +#else + 
event_tracer_(nullptr), +#endif method_name_(method_name), - named_data_map_(named_data_map) {} + named_data_map_(named_data_map) { + } /** Get the runtime allocator passed from Method. It's the same runtime * executor used by the standard executor runtime and the life span is the diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl index 4088110246d..827a63d2cef 100644 --- a/runtime/core/portable_type/c10/c10/targets.bzl +++ b/runtime/core/portable_type/c10/c10/targets.bzl @@ -65,7 +65,6 @@ def define_common_targets(): fbcode_exported_deps = ([ "//caffe2:aten-headers-cpu", "//caffe2:generated-config-header", - "//caffe2:torch_standalone_headers", "//caffe2/c10:c10_headers", ] + select({ "DEFAULT": [], @@ -84,7 +83,6 @@ def define_common_targets(): ] + get_sleef_preprocessor_flags(), xplat_exported_deps = [ "//xplat/caffe2:aten_header", - "//xplat/caffe2:torch_standalone_headers", "//xplat/caffe2/c10:c10_headers", ] + ["//xplat/caffe2:ovrsource_aten_Config.h" if is_arvr_mode() else "//xplat/caffe2:generated_aten_config_header",], exported_preprocessor_flags = select({ diff --git a/scripts/build_apple_frameworks.sh b/scripts/build_apple_frameworks.sh index a43deed9ab7..fd457d9f21c 100755 --- a/scripts/build_apple_frameworks.sh +++ b/scripts/build_apple_frameworks.sh @@ -65,11 +65,6 @@ liboptimized_native_cpu_ops_lib.a,\ libportable_kernels.a,\ :" -FRAMEWORK_KERNELS_PORTABLE="kernels_portable:\ -libportable_kernels.a,\ -libportable_ops_lib.a,\ -:" - FRAMEWORK_KERNELS_QUANTIZED="kernels_quantized:\ libquantized_kernels.a,\ libquantized_ops_lib.a,\ @@ -86,7 +81,6 @@ usage() { echo " --custom Only build the Custom kernels." echo " --mps Only build the Metal Performance Shaders backend." echo " --optimized Only build the Optimized kernels." - echo " --portable Only build the Portable kernels." echo " --quantized Only build the Quantized kernels." echo " --xnnpack Only build the XNNPACK backend." 
echo @@ -104,7 +98,6 @@ set_cmake_options_override() { "-DEXECUTORCH_BUILD_KERNELS_CUSTOM=OFF" "-DEXECUTORCH_BUILD_MPS=OFF" "-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=OFF" - "-DEXECUTORCH_BUILD_PORTABLE_OPS=OFF" "-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=OFF" "-DEXECUTORCH_BUILD_XNNPACK=OFF" ) @@ -135,7 +128,6 @@ for arg in "$@"; do --custom) set_cmake_options_override "EXECUTORCH_BUILD_KERNELS_CUSTOM" ;; --mps) set_cmake_options_override "EXECUTORCH_BUILD_MPS" ;; --optimized) set_cmake_options_override "EXECUTORCH_BUILD_KERNELS_OPTIMIZED" ;; - --portable) set_cmake_options_override "EXECUTORCH_BUILD_PORTABLE_OPS" ;; --quantized) set_cmake_options_override "EXECUTORCH_BUILD_KERNELS_QUANTIZED" ;; --xnnpack) set_cmake_options_override "EXECUTORCH_BUILD_XNNPACK" ;; *) @@ -240,7 +232,6 @@ for mode in "${MODES[@]}"; do append_framework_flag "EXECUTORCH_BUILD_XNNPACK" "$FRAMEWORK_BACKEND_XNNPACK" "$mode" append_framework_flag "EXECUTORCH_BUILD_KERNELS_CUSTOM" "$FRAMEWORK_KERNELS_CUSTOM" "$mode" append_framework_flag "EXECUTORCH_BUILD_KERNELS_OPTIMIZED" "$FRAMEWORK_KERNELS_OPTIMIZED" "$mode" - append_framework_flag "EXECUTORCH_BUILD_PORTABLE_OPS" "$FRAMEWORK_KERNELS_PORTABLE" "$mode" append_framework_flag "EXECUTORCH_BUILD_KERNELS_QUANTIZED" "$FRAMEWORK_KERNELS_QUANTIZED" "$mode" cd "${OUTPUT_DIR}" diff --git a/scripts/test_ios.sh b/scripts/test_ios.sh index 245f7b06f7a..b2b3ce94e35 100755 --- a/scripts/test_ios.sh +++ b/scripts/test_ios.sh @@ -60,10 +60,6 @@ say "Installing CoreML Backend Requirements" ./backends/apple/coreml/scripts/install_requirements.sh -say "Installing MPS Backend Requirements" - -./backends/apple/mps/install_requirements.sh - say "Exporting Models" python3 -m examples.portable.scripts.export --model_name="$MODEL_NAME" diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl index 4e379942c52..a731ce5c674 100644 --- 
a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -973,6 +973,22 @@ ATEN_OPS = ( "//executorch/kernels/portable/cpu/util:reduce_util", ], ), + op_target( + name = "op_rand", + deps = [ + ":scalar_utils", + "//executorch/runtime/core/exec_aten/util:scalar_type_util", + "//executorch/runtime/core/exec_aten/util:tensor_util", + ] + ), + op_target( + name = "op_randn", + deps = [ + ":scalar_utils", + "//executorch/runtime/core/exec_aten/util:scalar_type_util", + "//executorch/runtime/core/exec_aten/util:tensor_util", + ] + ), op_target( name = "op_reciprocal", deps = [ diff --git a/third-party/TARGETS b/third-party/TARGETS index c80bd9448b3..5dd76288a9b 100644 --- a/third-party/TARGETS +++ b/third-party/TARGETS @@ -81,18 +81,6 @@ runtime.python_binary( _is_external_target = True, ) -runtime.python_binary( - name = "gen_executorch", - main_module = "torchgen.gen_executorch", - visibility = [ - "PUBLIC", - ], - deps = [ - ":torchgen", - ], - _is_external_target = True, -) - runtime.filegroup( name = "aten_src_path", srcs = [ diff --git a/tools/cmake/Codegen.cmake b/tools/cmake/Codegen.cmake index ab616a5188d..f1dac84de43 100644 --- a/tools/cmake/Codegen.cmake +++ b/tools/cmake/Codegen.cmake @@ -91,8 +91,9 @@ function(generate_bindings_for_kernels) OUTPUT_STRIP_TRAILING_WHITESPACE ) file(GLOB_RECURSE _torchgen_srcs "${torchgen-out}/*.py") + # Not using module executorch.codegen.gen because it's not installed yet. set(_gen_command - "${PYTHON_EXECUTABLE}" -m torchgen.gen_executorch + "${PYTHON_EXECUTABLE}" -m codegen.gen --source-path=${EXECUTORCH_ROOT}/codegen --install-dir=${_out_dir} --tags-path=${torchgen-out}/packaged/ATen/native/tags.yaml --aten-yaml-path=${torchgen-out}/packaged/ATen/native/native_functions.yaml