diff --git a/.ci/scripts/setup-windows.ps1 b/.ci/scripts/setup-windows.ps1 index 20d29e4f558..329e81b3cf0 100644 --- a/.ci/scripts/setup-windows.ps1 +++ b/.ci/scripts/setup-windows.ps1 @@ -1,5 +1,5 @@ param ( - [string]$editable = $false + [string]$editable = "false" ) conda create --yes --quiet -n et python=3.12 diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh index c0910b47826..3c9ac598f8f 100644 --- a/.ci/scripts/test_torchao_huggingface_checkpoints.sh +++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -set -euo pipefail +set -euxo pipefail # ------------------------- # Args / flags diff --git a/.ci/scripts/unittest-windows.ps1 b/.ci/scripts/unittest-windows.ps1 index 65ed303051b..6f1365bc3fc 100644 --- a/.ci/scripts/unittest-windows.ps1 +++ b/.ci/scripts/unittest-windows.ps1 @@ -1,15 +1,38 @@ param ( - [string]$editable = $false + [string]$buildMode = "Release" ) Set-PSDebug -Trace 1 $ErrorActionPreference = 'Stop' $PSNativeCommandUseErrorActionPreference = $true -# Run pytest with coverage -# pytest -n auto --cov=./ --cov-report=xml -pytest -v --full-trace -c pytest-windows.ini +# Run native unit tests (via ctest) +New-Item -Path "test-build" -ItemType Directory +cd "test-build" + +cmake .. --preset windows -B . -DEXECUTORCH_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=$buildMode if ($LASTEXITCODE -ne 0) { - Write-Host "Pytest invocation was unsuccessful. Exit code: $LASTEXITCODE." + Write-Host "CMake configuration was unsuccessful. Exit code: $LASTEXITCODE." + exit $LASTEXITCODE +} + +cmake --build . -j8 --config $buildMode --verbose +if ($LASTEXITCODE -ne 0) { + Write-Host "CMake build was unsuccessful. Exit code: $LASTEXITCODE." exit $LASTEXITCODE } + +ctest -j8 . --build-config $buildMode --output-on-failure -E "method_test|tensor_parser_test" +if ($LASTEXITCODE -ne 0) { + Write-Host "CTest run was unsuccessful. Exit code: $LASTEXITCODE." + exit $LASTEXITCODE +} + +cd .. + +# Run pytest +pytest -v -c pytest-windows.ini +if ($LASTEXITCODE -ne 0) { + Write-Host "Pytest invocation was unsuccessful. Exit code: $LASTEXITCODE." + exit $LASTEXITCODE +} \ No newline at end of file diff --git a/.github/workflows/_unittest.yml b/.github/workflows/_unittest.yml index a619b33dd2e..587f2cf5e5a 100644 --- a/.github/workflows/_unittest.yml +++ b/.github/workflows/_unittest.yml @@ -69,7 +69,15 @@ jobs: \$ErrorActionPreference = 'Stop' \$PSNativeCommandUseErrorActionPreference = \$true - .ci/scripts/setup-windows.ps1 + .ci/scripts/setup-windows.ps1 -editable "${{ inputs.editable }}" + if (\$LASTEXITCODE -ne 0) { + Write-Host "Setup failed. Exit code: \$LASTEXITCODE." + exit \$LASTEXITCODE + } - powershell .ci/scripts/unittest-windows.ps1 -editable "${{ inputs.editable }}" + .ci/scripts/unittest-windows.ps1 -buildMode "${{ inputs.build-mode }}" + if (\$LASTEXITCODE -ne 0) { + Write-Host "Unit tests failed. Exit code: \$LASTEXITCODE." 
+ exit \$LASTEXITCODE + } }" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 815e106ae1e..d8c551e8982 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -779,6 +779,7 @@ jobs: contents: read strategy: fail-fast: false + if: false # TODO Re-enable after fixing timeouts (#14314) with: runner: linux.2xlarge docker-image: ci-image:executorch-ubuntu-22.04-gcc9 diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index f5c5161e0cc..975a8ebbb30 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -1032,5 +1032,5 @@ jobs: .ci/scripts/setup-windows.ps1 - powershell .ci/scripts/test_model.ps1 -modelName ${{ matrix.model }} -backend ${{ matrix.backend }} + .ci/scripts/test_model.ps1 -modelName ${{ matrix.model }} -backend ${{ matrix.backend }} }" diff --git a/CMakeLists.txt b/CMakeLists.txt index 2664b4491c9..fc427d517a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -143,9 +143,13 @@ endif() # -ffunction-sections -fdata-sections: breaks function and data into sections so # they can be properly gc'd. -s: strip symbol. -set(CMAKE_CXX_FLAGS_RELEASE - "-ffunction-sections -fdata-sections ${CMAKE_CXX_FLAGS_RELEASE}" -) +if(WIN32) + set(CMAKE_CXX_FLAGS_RELEASE "/Gy /Gw ${CMAKE_CXX_FLAGS_RELEASE}") +else() + set(CMAKE_CXX_FLAGS_RELEASE + "-ffunction-sections -fdata-sections ${CMAKE_CXX_FLAGS_RELEASE}" + ) +endif() if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s") endif() diff --git a/backends/arm/TARGETS b/backends/arm/TARGETS index b00e8057df6..a78ab252739 100644 --- a/backends/arm/TARGETS +++ b/backends/arm/TARGETS @@ -22,8 +22,7 @@ runtime.python_library( "common/debug.py", ], deps = [ - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/serializer:serializer", - "fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/serializer:serializer", + "fbsource//third-party/tosa_tools:serializer", "//caffe2:torch", "//executorch/exir:lib", ], @@ -37,10 +36,8 @@ runtime.python_library( deps = [ "fbsource//third-party/pypi/flatbuffers:flatbuffers", "fbsource//third-party/pypi/ml-dtypes:ml-dtypes", - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/serializer:serializer", - "fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/serializer:serializer", - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/tosa:tosa", - "fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/tosa:tosa", + "fbsource//third-party/tosa_tools:serializer", + "fbsource//third-party/tosa_tools:tosa", ":process_node", "//executorch/exir/backend:compile_spec_schema", "//executorch/backends/arm/operators:lib", @@ -83,8 +80,7 @@ runtime.python_library( name = "process_node", srcs = ["process_node.py"], deps = [ - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/tosa:tosa", - "fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/tosa:tosa", + "fbsource//third-party/tosa_tools:tosa", "//executorch/backends/arm/operators:node_visitor", "//executorch/backends/arm/tosa:mapping", "//executorch/backends/arm/tosa:quant_utils", diff --git a/backends/arm/arm_vela.py b/backends/arm/arm_vela.py index c47a5c58f49..90f9dcb8324 100644 --- a/backends/arm/arm_vela.py +++ b/backends/arm/arm_vela.py @@ -25,6 +25,8 @@ # per-io structs to simplify runtime use. 
def vela_bin_pack_io(prefix, data): vela_input_shapes = data[prefix + "_shape"] + # Vela input/output shape is fixed to 6D + vela_io_shape_dims = 6 ios = struct.pack(" bytes: """ Static helper method to do the compilation of the TOSA flatbuffer representation to a target specific binary stream. """ - compile_flags = [] - for spec in compile_spec: - if spec.key == "compile_flags": - compile_flags.append(spec.value.decode()) + compile_flags = compile_spec.compiler_flags if len(compile_flags) == 0: # Not testing for compile_flags correctness here, just that they are @@ -64,10 +62,11 @@ def _compile_tosa_flatbuffer( @staticmethod def preprocess( edge_program: ExportedProgram, - compile_spec: List[CompileSpec], + compile_specs: List[CompileSpec], ) -> PreprocessResult: logger.info(f"{EthosUBackend.__name__} preprocess") + compile_spec = EthosUCompileSpec.from_list(compile_specs) # deduce TOSA compile_spec from Ethos-U compile spec. We get a new # compile spec list, containing only elements relevant for the # TOSABackend. @@ -77,7 +76,7 @@ def preprocess( # ('All backend implementation are final...'), so use composition instead. # preprocess returns the serialized TOSA flatbuffer in .processed_bytes, # which can be passed on to next compilation step. - tosa_preprocess = TOSABackend.preprocess(edge_program, tosa_compile_spec) + tosa_preprocess = TOSABackend._preprocess(edge_program, tosa_compile_spec) binary = EthosUBackend._compile_tosa_flatbuffer( tosa_preprocess.processed_bytes, compile_spec diff --git a/backends/arm/operators/TARGETS b/backends/arm/operators/TARGETS index 2c255b3c17a..afe1c4dd22c 100644 --- a/backends/arm/operators/TARGETS +++ b/backends/arm/operators/TARGETS @@ -20,8 +20,7 @@ runtime.python_library( name = "ops", srcs = glob(["op_*.py", "ops_*.py"]), deps = [ - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/tosa:tosa", - "fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/tosa:tosa", + "fbsource//third-party/tosa_tools:tosa", ":node_visitor", ":operator_validation_utils", "//executorch/backends/arm/tosa:mapping", diff --git a/backends/arm/operators/op_abs.py b/backends/arm/operators/op_abs.py index 625293d66e0..ec76eb5517f 100644 --- a/backends/arm/operators/op_abs.py +++ b/backends/arm/operators/op_abs.py @@ -73,7 +73,9 @@ def define_node( abs_output = output # Do the INT32 Abs - tosa_graph.addOperator( + self._serialize_operator( + node, + tosa_graph, ts.TosaOp.Op().ABS, [ rescaled_inputs[0].name, diff --git a/backends/arm/operators/op_rescale.py b/backends/arm/operators/op_rescale.py index d7be2be737c..d331ebc80d5 100644 --- a/backends/arm/operators/op_rescale.py +++ b/backends/arm/operators/op_rescale.py @@ -46,13 +46,20 @@ def define_node( input_zp = cast(int, node.args[3]) output_zp = cast(int, node.args[4]) - if input_dtype != map_dtype(torch.int8, self.tosa_spec) and input_zp != 0: + if ( + input_dtype + not in [ + map_dtype(torch.int8, self.tosa_spec), + map_dtype(torch.int16, self.tosa_spec), + ] + and input_zp != 0 + ): raise ValueError( - f"If input dtype is not int8, input_zp must be 0. Got input_dtype{input_dtype=}, {input_zp=}" + f"If input dtype is not int8 or int16, input_zp must be 0. Got input_dtype{input_dtype=}, {input_zp=}" ) - if output_dtype != torch.int8 and output_zp != 0: + if output_dtype not in [torch.int8, torch.int16] and output_zp != 0: raise ValueError( - f"If output dtype is not int8, output_zp must be 0. 
Got {ts.DTypeNames[output_dtype]}, {output_zp=}" + f"If output dtype is not int8 or int16, output_zp must be 0. Got {ts.DTypeNames[output_dtype]}, {output_zp=}" ) build_rescale( diff --git a/backends/arm/operators/op_sum.py b/backends/arm/operators/op_sum.py index 0bd152a8b8c..00676d9f9b3 100644 --- a/backends/arm/operators/op_sum.py +++ b/backends/arm/operators/op_sum.py @@ -67,7 +67,9 @@ def define_node( dtype=ts.DType.INT32, ) - tosa_graph.addOperator( + self._serialize_operator( + node, + tosa_graph, ts.TosaOp.Op().REDUCE_SUM, [rescaled_inputs[0].name], [intermediate.name], @@ -111,7 +113,9 @@ def define_node( attr = ts.TosaSerializerAttribute() attr.ReduceSumAttribute(tensor.dim_order.index(dim)) - tosa_graph.addOperator( + self._serialize_operator( + node, + tosa_graph, ts.TosaOp.Op().REDUCE_SUM, [tensor.name], [output.name], diff --git a/backends/arm/requirements-arm-ethos-u.txt b/backends/arm/requirements-arm-ethos-u.txt index 5fad9d2fe94..a26fb014234 100644 --- a/backends/arm/requirements-arm-ethos-u.txt +++ b/backends/arm/requirements-arm-ethos-u.txt @@ -3,4 +3,4 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -ethos-u-vela @ git+https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela@d37febc1715edf0d236c2ff555739a8a9aadcf9a +ethos-u-vela == 4.4.0 diff --git a/backends/arm/requirements-arm-tosa.txt b/backends/arm/requirements-arm-tosa.txt index 4b7a3ec0273..0f9c2f702a4 100644 --- a/backends/arm/requirements-arm-tosa.txt +++ b/backends/arm/requirements-arm-tosa.txt @@ -5,5 +5,7 @@ ml_dtypes == 0.5.1 flatbuffers == 24.3.25 +tosa-adapter-model-explorer == 0.0.1 +ai-edge-model-explorer >= 0.1.16 tosa-tools @ git+https://git.gitlab.arm.com/tosa/tosa-reference-model.git@v2025.07.0 diff --git a/backends/arm/runtime/EthosUBackend.cpp b/backends/arm/runtime/EthosUBackend.cpp index bff5ff69284..8f63569eece 100644 --- a/backends/arm/runtime/EthosUBackend.cpp +++ b/backends/arm/runtime/EthosUBackend.cpp @@ -383,8 +383,8 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { *tensor_count = *tensor_count * tensor.size(i); } - // The VelaIO type has a shape of fixed size 4 - for (int i = 0; i < 4; i++) { + // The VelaIO type has a shape of fixed size 6 + for (int i = 0; i < shapeDim; i++) { *io_count = *io_count * io->shape[i]; } } diff --git a/backends/arm/runtime/VelaBinStream.h b/backends/arm/runtime/VelaBinStream.h index 7a7ea9b6266..7f6606200b3 100644 --- a/backends/arm/runtime/VelaBinStream.h +++ b/backends/arm/runtime/VelaBinStream.h @@ -34,9 +34,11 @@ typedef struct { char data[]; // block.name specific format data } VelaBinBlock; +constexpr int shapeDim = 6; // Number of dimensions in VelaIO + // A Vela input or output descriptor in the binary stream typedef struct { - int shape[4]; // Up to 4D shape of input or output + int shape[shapeDim]; // Shape of input or output int elem_size; // Element sizeof in bytes int offset; // Offset in bytes within SRAM working data int region; // Scratch region this belongs to diff --git a/backends/arm/scripts/build_executor_runner.sh b/backends/arm/scripts/build_executor_runner.sh index a05287ac4bf..104e3d02a25 100755 --- a/backends/arm/scripts/build_executor_runner.sh +++ b/backends/arm/scripts/build_executor_runner.sh @@ -44,7 +44,7 @@ help() { echo " --memory_mode= Vela memory mode, used for setting the Timing Adapter parameters of the Corstone platforms." 
echo " Valid values are Shared_Sram(for Ethos-U55, Ethos-U65, Ethos-85), Sram_Only(for Ethos-U55, Ethos-U65, Ethos-U85) or Dedicated_Sram(for Ethos-U65, Ethos-U85)." echo " Default: Shared_Sram for the Ethos-U55 and Sram_Only for the Ethos-U85" - echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log" + echo " --etdump Adds Devtools etdump support to track timing and output, etdump area will be base64 encoded in the log" echo " --extra_build_flags= Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none " echo " --output= Output folder Default: /_.pte" echo " --et_build_root= Build output root folder to use, defaults to ${et_build_root}" @@ -161,7 +161,7 @@ if [ "$bundleio" = true ] ; then fi if [ "$build_with_etdump" = true ] ; then - build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON " + build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=ON -DET_DUMP_INTERMEDIATE_OUTPUTS=ON " fi echo "Building with BundleIO/etdump/extra flags: ${build_bundleio_flags} ${build_with_etdump_flags} ${extra_build_flags}" diff --git a/backends/arm/scripts/mlsdk_utils.sh b/backends/arm/scripts/mlsdk_utils.sh index f62b9f6d4f0..7a7d2585e52 100755 --- a/backends/arm/scripts/mlsdk_utils.sh +++ b/backends/arm/scripts/mlsdk_utils.sh @@ -7,7 +7,7 @@ set -euo pipefail mlsdk_manifest_url="https://github.com/arm/ai-ml-sdk-manifest.git" -mlsdk_manifest_tag="dev-snapshot-2025-09-12" +mlsdk_manifest_tag="refs/tags/dev-snapshot-2025-09-12" script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) diff --git a/backends/arm/scripts/run_fvp.sh b/backends/arm/scripts/run_fvp.sh index 769b2e30282..0f76d0496de 100755 --- a/backends/arm/scripts/run_fvp.sh +++ b/backends/arm/scripts/run_fvp.sh @@ -21,6 +21,7 @@ elf_file="" data_file="" target="ethos-u55-128" timeout="600" +etrecord_file="" help() { echo "Usage: $(basename $0) [options]" @@ -29,6 +30,7 @@ help() { echo " --data=@
Place a file in memory at this address, useful to emulate a PTE flashed into memory instead as part of the code." echo " --target= Target to build and run for Default: ${target}" echo " --timeout= Maximum target runtime, used to detect hanging, might need to be higer on large models Default: ${timeout}" + echo " --etrecord= If ETDump is used you can supply a ETRecord file matching the PTE" exit 0 } @@ -39,6 +41,7 @@ for arg in "$@"; do --data=*) data_file="--data ${arg#*=}";; --target=*) target="${arg#*=}";; --timeout=*) timeout="${arg#*=}";; + --etrecord=*) etrecord_file="${arg#*=}";; *) ;; esac @@ -115,15 +118,23 @@ echo "Checking for a etdump in log" ! grep "#\[RUN THIS\]" ${log_file} >/dev/null if [ $? != 0 ]; then echo "Found ETDump in log!" + devtools_extra_args="" echo "#!/bin/sh" > etdump_script.sh sed -n '/^#\[RUN THIS\]$/,/^#\[END\]$/p' ${log_file} >> etdump_script.sh # You can run etdump_script.sh if you do # $ chmod a+x etdump_script.sh # $ ./etdump_script.sh # But lets not trust the script as a bad patch would run bad code on your machine - grep ">etdump.bin" etdump_script.sh | cut -d\" -f2- | cut -d\" -f1 >etdump.base64 - base64 -d etdump.base64 >etdump.bin - python3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin --source_time_scale cycles --target_time_scale cycles + grep ">etdump.bin" etdump_script.sh | cut -d\" -f2- | cut -d\" -f1 | base64 -d >etdump.bin + ! grep ">debug_buffer.bin" etdump_script.sh >/dev/null + if [ $? != 0 ]; then + grep ">debug_buffer.bin" etdump_script.sh | cut -d\" -f2- | cut -d\" -f1 | base64 -d >debug_buffer.bin + devtools_extra_args="${devtools_extra_args} --debug_buffer_path debug_buffer.bin" + fi + if [[ ${etrecord_file} != "" ]]; then + devtools_extra_args="${devtools_extra_args} --etrecord_path ${etrecord_file}" + fi + python3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin ${devtools_extra_args} --source_time_scale cycles --target_time_scale cycles fi echo "Checking for problems in log:" diff --git a/backends/arm/test/TARGETS b/backends/arm/test/TARGETS index 8ffad640d5a..ec35b63f8f6 100644 --- a/backends/arm/test/TARGETS +++ b/backends/arm/test/TARGETS @@ -40,8 +40,17 @@ runtime.python_library( ) runtime.python_library( - name = "arm_tester", - srcs = glob(["tester/*.py"]), + name = "arm_tester_serialize", + srcs = ["tester/serialize.py"], + deps = [ + "//executorch/backends/xnnpack/test/tester:tester", + "//executorch/devtools/backend_debug:delegation_info", + ] +) + +runtime.python_library( + name = "arm_tester_lib", + srcs = glob(["tester/*.py"], exclude = ["tester/serialize.py"]), deps = [ ":common", "//executorch/backends/xnnpack/test/tester:tester", @@ -55,4 +64,13 @@ runtime.python_library( ] ) + +runtime.python_library( + name = "arm_tester", + deps = [ + "//executorch/backends/arm/test:arm_tester_lib", + "//executorch/backends/arm/test:arm_tester_serialize", + ] +) + define_arm_tests() diff --git a/backends/arm/test/misc/test_tosa_spec.py b/backends/arm/test/misc/test_tosa_spec.py index 968512f54c6..190c50f4aa1 100644 --- a/backends/arm/test/misc/test_tosa_spec.py +++ b/backends/arm/test/misc/test_tosa_spec.py @@ -5,13 +5,8 @@ import unittest -from executorch.backends.arm.tosa.specification import ( - get_tosa_spec, - Tosa_1_00, - TosaSpecification, -) +from executorch.backends.arm.tosa.specification import Tosa_1_00, TosaSpecification -from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized # type: ignore[import-untyped] test_valid_strings = [ @@ -43,14 
+38,6 @@ "TOSA-1.0.0+BF16+fft+int4+cf+INT", ] -test_compile_specs = [ - ([CompileSpec("tosa_spec", "TOSA-1.0.0+INT".encode())],), -] - -test_compile_specs_no_version = [ - ([CompileSpec("other_key", "some_value".encode())],), -] - class TestTosaSpecification(unittest.TestCase): """Tests the TOSA specification class""" @@ -74,19 +61,6 @@ def test_invalid_version_strings(self, version_string: str): assert tosa_spec is None - @parameterized.expand(test_compile_specs) # type: ignore[misc] - def test_create_from_compilespec(self, compile_specs: list[CompileSpec]): - tosa_spec = get_tosa_spec(compile_specs) - assert isinstance(tosa_spec, TosaSpecification) - - @parameterized.expand(test_compile_specs_no_version) # type: ignore[misc] - def test_create_from_invalid_compilespec(self, compile_specs: list[CompileSpec]): - tosa_spec = None - with self.assertRaises(ValueError): - tosa_spec = get_tosa_spec(compile_specs) - - assert tosa_spec is None - @parameterized.expand(test_valid_strings) def test_correct_string_representation(self, version_string: str): tosa_spec = TosaSpecification.create_from_string(version_string) diff --git a/backends/arm/test/ops/test_logical.py b/backends/arm/test/ops/test_logical.py index bb7c5773342..2b160ce7b50 100644 --- a/backends/arm/test/ops/test_logical.py +++ b/backends/arm/test/ops/test_logical.py @@ -86,9 +86,6 @@ def forward(self, tensor: torch.Tensor): ################# -xfails = {"rand_rank4": "MLBEDSW-11031: Output diff on u85 bool transpose."} - - @common.parametrize("test_data", And().test_data) def test_logical_and_tosa_FP(test_data: input_t2): pipeline = TosaPipelineFP[input_t2]( @@ -132,7 +129,7 @@ def test_logical_and_u55_INT_not_delegated(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", And().test_data, xfails=xfails) +@common.parametrize("test_data", And().test_data) @common.XfailIfNoCorstone320 def test_logical_and_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -226,7 +223,7 @@ def test_logical_xor_u55_INT_not_delegated(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", Xor().test_data, xfails=xfails) +@common.parametrize("test_data", Xor().test_data) @common.XfailIfNoCorstone320 def test_logical_xor_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -320,7 +317,7 @@ def test_logical_or_u55_INT_not_delegated(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", Or().test_data, xfails=xfails) +@common.parametrize("test_data", Or().test_data) @common.XfailIfNoCorstone320 def test_logical_or_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( @@ -414,7 +411,7 @@ def test_logical_not_u55_INT_not_delegated(test_data: input_t2): pipeline.run() -@common.parametrize("test_data", Not().test_data, xfails=xfails) +@common.parametrize("test_data", Not().test_data) @common.XfailIfNoCorstone320 def test_logical_not_u85_INT(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( diff --git a/backends/arm/test/ops/test_logsoftmax.py b/backends/arm/test/ops/test_logsoftmax.py index 791069aa4b0..c4a68caabac 100644 --- a/backends/arm/test/ops/test_logsoftmax.py +++ b/backends/arm/test/ops/test_logsoftmax.py @@ -64,13 +64,7 @@ def test_log_softmax_tosa_INT(test_data): pipeline.run() -@common.parametrize( - "test_data", - LogSoftmax.test_data, - xfails={ - "randn_neg_dim": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55." 
- }, -) +@common.parametrize("test_data", LogSoftmax.test_data) @common.XfailIfNoCorstone300() def test_log_softmax_u55_INT(test_data): data, dim = test_data() diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index dc258f20ec4..6b4455fc702 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -61,13 +61,7 @@ def test_softmax_tosa_INT(test_data): pipeline.run() -@common.parametrize( - "test_data", - Softmax.test_data, - { - "randn_neg_dim": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55." - }, -) +@common.parametrize("test_data", Softmax.test_data) @common.XfailIfNoCorstone300 def test_softmax_u55_INT(test_data): data, dim = test_data() diff --git a/backends/arm/test/ops/test_squeeze.py b/backends/arm/test/ops/test_squeeze.py index 5c9f031deec..0de51673496 100644 --- a/backends/arm/test/ops/test_squeeze.py +++ b/backends/arm/test/ops/test_squeeze.py @@ -29,6 +29,7 @@ class SqueezeDim(torch.nn.Module): "squeeze3d_dim_neg_2": lambda: (torch.randn(1, 1, 5), -2), "squeeze4d_dim_pos_3": lambda: (torch.randn(1, 2, 3, 1), 3), "squeeze4d_dim_neg_2": lambda: (torch.randn(1, 5, 1, 5), -2), + "squeeze5d_dim_neg_2": lambda: (torch.randn(1, 1, 5, 1, 5), -2), } def forward(self, x: torch.Tensor, dim: int): @@ -40,6 +41,7 @@ class SqueezeDims(torch.nn.Module): "squeeze3d_dims_0_1": lambda: (torch.randn(1, 1, 5), (0, 1)), "squeeze4d_dims_0_neg_1": lambda: (torch.randn(1, 5, 5, 1), (0, -1)), "squeeze4d_dims_0_neg_2": lambda: (torch.randn(1, 5, 1, 5), (0, -2)), + "squeeze5d_dims_0_neg_2": lambda: (torch.randn(1, 1, 5, 1, 5), (0, -2)), } def forward(self, x: torch.Tensor, dims: tuple[int]): @@ -51,6 +53,7 @@ class Squeeze(torch.nn.Module): "squeeze3d": lambda: (torch.randn(1, 1, 5),), "squeeze4d_dims": lambda: (torch.randn(1, 5, 5, 1),), "squeeze3d_dims_mix": lambda: (torch.randn(1, 5, 1, 5),), + "squeeze4d_dims_mix": lambda: (torch.randn(1, 1, 5, 1, 5),), } def forward(self, x: torch.Tensor): diff --git a/backends/arm/test/ops/test_tanh.py b/backends/arm/test/ops/test_tanh.py index 0e74618fd2f..f3f4df31d0e 100644 --- a/backends/arm/test/ops/test_tanh.py +++ b/backends/arm/test/ops/test_tanh.py @@ -70,25 +70,27 @@ def test_tanh_tosa_INT(test_data: Tuple): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 def test_tanh_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Tanh(), (test_data(),), aten_op, exir_ops=[], - run_on_fvp=False, + run_on_fvp=True, ) pipeline.run() @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 def test_tanh_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Tanh(), (test_data(),), aten_op, exir_ops=[], - run_on_fvp=False, + run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_unflatten.py b/backends/arm/test/ops/test_unflatten.py index 95c68b2940d..7f98ababd65 100644 --- a/backends/arm/test/ops/test_unflatten.py +++ b/backends/arm/test/ops/test_unflatten.py @@ -9,6 +9,8 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineINT, + EthosU85PipelineINT, TosaPipelineFP, TosaPipelineINT, VgfPipeline, @@ -30,8 +32,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return torch.unflatten(x, self.dim, self.sizes) test_data: dict[str, test_data_t] = { - "randn_4d": (lambda: (Unflatten(1, (2, 2)), (torch.randn(3, 4, 5, 1),))), - "rand_3d": (lambda: 
(Unflatten(1, (-1, 2)), (torch.rand(3, 4, 4),))), + "rand_3d_batch3": (lambda: (Unflatten(1, (-1, 2)), (torch.rand(3, 4, 4),))), + "rand_3d_batch1": (lambda: (Unflatten(1, (-1, 2)), (torch.rand(1, 4, 4),))), + "randn_4d_dim1": (lambda: (Unflatten(1, (2, 2)), (torch.randn(3, 4, 5, 1),))), + "randn_4d_dim3": (lambda: (Unflatten(3, (2, 2)), (torch.randn(1, 1, 5, 4),))), } @@ -49,7 +53,33 @@ def test_unflatten_int_tosa_FP(test_data: test_data_t): @common.parametrize("test_data", Unflatten.test_data) def test_unflatten_int_tosa_INT(test_data: test_data_t): module, inputs = test_data() - pipeline = TosaPipelineINT[input_t]( + pipeline = TosaPipelineINT[input_t](module, inputs, Unflatten.aten_op) + pipeline.run() + + +xfails = { + "rand_3d_batch3": "Batch size > 1 currently not supported for FVP tests", + "randn_4d_dim1": "Batch size > 1 currently not supported for FVP tests", +} + + +@common.parametrize("test_data", Unflatten.test_data, xfails=xfails, strict=False) +@common.XfailIfNoCorstone300 +def test_unflatten_int_u55_INT(test_data: test_data_t): + module, inputs = test_data() + pipeline = EthosU55PipelineINT[input_t]( + module, + inputs, + Unflatten.aten_op, + ) + pipeline.run() + + +@common.parametrize("test_data", Unflatten.test_data, xfails=xfails, strict=False) +@common.XfailIfNoCorstone320 +def test_unflatten_int_u85_INT(test_data: test_data_t): + module, inputs = test_data() + pipeline = EthosU85PipelineINT[input_t]( module, inputs, Unflatten.aten_op, diff --git a/backends/arm/test/ops/test_unsqueeze.py b/backends/arm/test/ops/test_unsqueeze.py index 54e1b0dd0ce..9da1a352ebb 100644 --- a/backends/arm/test/ops/test_unsqueeze.py +++ b/backends/arm/test/ops/test_unsqueeze.py @@ -25,7 +25,7 @@ class Unsqueeze(torch.nn.Module): - shapes: list[int | Sequence[int]] = [5, (5, 5), (5, 4), (5, 4, 3)] + shapes: list[int | Sequence[int]] = [5, (5, 5), (5, 4), (5, 4, 3), (1, 5, 4, 3)] test_parameters = {} for n in shapes: test_parameters[f"rand_{n}"] = (torch.randn(n),) diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py index fb0ba54436e..ed942c07aa1 100644 --- a/backends/arm/test/ops/test_view.py +++ b/backends/arm/test/ops/test_view.py @@ -51,6 +51,10 @@ class View(torch.nn.Module): "rand_4d_4_3": lambda: (torch.rand(5, 10, 1, 1), (1, 25, 2)), "rand_4d_4_2": lambda: (torch.rand(2, 50, 1, 1), (1, 100)), "rand_4d_2_4_same": lambda: (torch.rand(2, 3, 2, 3), (2, 3, 3, 2)), + "rand_4d_5d": lambda: (torch.rand(1, 3, 4, 5), (1, 1, 4, 5, -1)), + "rand_5d_5d": lambda: (torch.rand(1, 1, 4, 5, 6), (1, 1, 4, -1, 6)), + "rand_5d_3d": lambda: (torch.rand(1, 1, 4, 5, 6), (2, 3, -1)), + "rand_3d_5d": lambda: (torch.rand(4, 5, 6), (1, 1, 2, -1, 3)), } rank_product_too_large = { diff --git a/backends/arm/test/passes/test_rescale_pass.py b/backends/arm/test/passes/test_rescale_pass.py index 0959a0eaa25..3baa03fde65 100644 --- a/backends/arm/test/passes/test_rescale_pass.py +++ b/backends/arm/test/passes/test_rescale_pass.py @@ -172,14 +172,7 @@ def test_quantized_rescale_tosa_bi(test_data: tuple[torch.Tensor, torch.Tensor]) pipeline.run() -u55_xfails = { - "ones": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55.", - "randn_ones": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55.", - "randn_large": "MLBEDSW-11032: ILLEGAL_OFM_BASE error: Base addresses must be aligned to brick depth on u55.", -} - - -@common.parametrize("test_data", RescaleNetwork.test_data, xfails=u55_xfails) 
+@common.parametrize("test_data", RescaleNetwork.test_data) @common.XfailIfNoCorstone300 def test_quantized_rescale_u55(test_data: tuple[torch.Tensor, torch.Tensor]): """Tests a model with many ops that requires rescales. As more ops are quantized to int32 and diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl index 62bc5aef57a..f240855cdf4 100644 --- a/backends/arm/test/targets.bzl +++ b/backends/arm/test/targets.bzl @@ -1,4 +1,5 @@ # load("//caffe2/test/fb:defs.bzl", "define_tests") +load("@fbsource//tools/build_defs:fbsource_utils.bzl", "is_fbcode") load("@fbcode_macros//build_defs:python_pytest.bzl", "python_pytest") load("@bazel_skylib//lib:paths.bzl", "paths") @@ -59,7 +60,7 @@ def define_arm_tests(): "//executorch/kernels/quantized:custom_ops_generated_lib", ], deps = [ - "//executorch/backends/arm/test:arm_tester", + "//executorch/backends/arm/test/tester/fb:arm_tester_fb" if is_fbcode else "//executorch/backends/arm/test:arm_tester", "//executorch/backends/arm/test:conftest", "//executorch/backends/arm:ethosu", "//executorch/backends/arm/tosa:compile_spec", diff --git a/backends/arm/test/test_model.py b/backends/arm/test/test_model.py index 8833b7050e7..c336d67ad51 100755 --- a/backends/arm/test/test_model.py +++ b/backends/arm/test/test_model.py @@ -184,7 +184,7 @@ def build_ethosu_runtime( "--build_type=Release", f"--system_config={system_config}", f"--memory_mode={memory_mode}", - f"--extra_build_flags=-DET_DUMP_OUTPUT=OFF {extra_flags}", + f"--extra_build_flags=-DET_LOG_DUMP_OUTPUT=OFF {extra_flags}", f"--output={elf_build_path}", ] ) diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index 284d4d6d1c4..bb249644c47 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -7,7 +7,6 @@ import logging -import os from collections import Counter from pprint import pformat from typing import ( @@ -42,10 +41,7 @@ ) from executorch.backends.arm.test.runner_utils import ( dbg_tosa_fb_to_json, - get_elf_path, get_output_quantization_params, - get_target_board, - run_target, TosaReferenceModelDispatch, ) @@ -53,6 +49,7 @@ dump_error_output, print_error_diffs, ) +from executorch.backends.arm.test.tester.serialize import Serialize from executorch.backends.arm.tosa import TosaSpecification from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec from executorch.backends.arm.tosa.mapping import extract_tensor_meta @@ -90,7 +87,6 @@ from torch.export.graph_signature import ExportGraphSignature, InputSpec, OutputSpec from torch.fx import Graph -from torch.utils._pytree import tree_flatten logger = logging.getLogger(__name__) @@ -179,43 +175,6 @@ def run( ) -class Serialize(tester.Serialize): - def __init__(self, compile_spec: ArmCompileSpec, timeout): - super().__init__() - self.timeout = timeout - self.executorch_program_manager: ExecutorchProgramManager | None - self.compile_spec = compile_spec - - def run(self, artifact: ExecutorchProgramManager, inputs=None) -> None: - super().run(artifact, inputs) - # Keep the entire ExecutorchProgramManager for execution. - self.executorch_program_manager = artifact - - def run_artifact(self, inputs): - if self.executorch_program_manager is None: - raise RuntimeError( - "Tried running artifact from Serialize stage without running the stage." 
- ) - inputs_flattened, _ = tree_flatten(inputs) - intermediate_path = self.compile_spec.get_intermediate_path() - target_board = get_target_board(self.compile_spec) - elf_path = get_elf_path(target_board) - - if not os.path.exists(elf_path): - raise FileNotFoundError( - f"Did not find build arm_executor_runner in path {elf_path}, run setup_testing.sh?" - ) - - return run_target( - self.executorch_program_manager, - inputs_flattened, - intermediate_path, - target_board, - elf_path, - self.timeout, - ) - - class ToExecutorch(tester.ToExecutorch): def run_artifact(self, inputs): with TosaReferenceModelDispatch(): @@ -303,7 +262,7 @@ def __init__( Args: model (torch.nn.Module): The model to test example_inputs (Tuple[torch.Tensor]): Example inputs to the model - compile_spec (List[CompileSpec]): The compile spec to use + compile_spec (ArmCompileSpec): The compile spec to use """ self.transform_passes = transform_passes @@ -419,7 +378,11 @@ def serialize( self, serialize_stage: Optional[Serialize] = None, timeout: int = 480 ): if serialize_stage is None: - serialize_stage = Serialize(self.compile_spec, timeout) + serialize_stage = Serialize( + compile_spec=self.compile_spec, + module=self.original_module, + timeout=timeout, + ) assert ( self.compile_spec.get_intermediate_path() is not None ), "Can't dump serialized file when compile specs do not contain an artifact path." diff --git a/backends/arm/test/tester/serialize.py b/backends/arm/test/tester/serialize.py new file mode 100644 index 00000000000..f0fd246b3a6 --- /dev/null +++ b/backends/arm/test/tester/serialize.py @@ -0,0 +1,75 @@ +# Copyright 2024-2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import os +from typing import Optional + +import executorch.backends.xnnpack.test.tester.tester as tester + +import torch.fx + +from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec + +from executorch.backends.arm.test.runner_utils import ( + get_elf_path, + get_target_board, + run_target, +) + +from executorch.exir import ExecutorchProgramManager +from torch.utils._pytree import tree_flatten + + +logger = logging.getLogger(__name__) + + +class Serialize(tester.Serialize): + def __init__( + self, + compile_spec: ArmCompileSpec, + module: Optional[torch.nn.Module], + timeout: int = 120, + ): + """ + Args: + compile_spec: CompileSpecs to be used for serialization. + module: Original Module to be used for serialization. Optional - can be used for reference output generation. + timeout: Timeout for fvp. Default is 120 seconds. + """ + super().__init__() + self.module = module + self.timeout = timeout + self.executorch_program_manager: ExecutorchProgramManager | None + self.compile_spec = compile_spec + + def run(self, artifact: ExecutorchProgramManager, inputs=None) -> None: + super().run(artifact, inputs) + # Keep the entire ExecutorchProgramManager for execution. + self.executorch_program_manager = artifact + + def run_artifact(self, inputs): + if self.executorch_program_manager is None: + raise RuntimeError( + "Tried running artifact from Serialize stage without running the stage." 
+ ) + inputs_flattened, _ = tree_flatten(inputs) + intermediate_path = self.compile_spec.get_intermediate_path() + target_board = get_target_board(self.compile_spec) + elf_path = get_elf_path(target_board) + + if not os.path.exists(elf_path): + raise FileNotFoundError( + f"Did not find build arm_executor_runner in path {elf_path}, run setup_testing.sh?" + ) + + return run_target( + self.executorch_program_manager, + inputs_flattened, + intermediate_path, + target_board, + elf_path, + self.timeout, + ) diff --git a/backends/arm/tosa/TARGETS b/backends/arm/tosa/TARGETS index df32689bc3e..51919025591 100644 --- a/backends/arm/tosa/TARGETS +++ b/backends/arm/tosa/TARGETS @@ -6,8 +6,7 @@ runtime.python_library( "mapping.py", ], deps = [ - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/serializer:serializer", - "fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/serializer:serializer", + "fbsource//third-party/tosa_tools:serializer", "//caffe2:torch", ":specification", ], @@ -19,10 +18,8 @@ runtime.python_library( ], deps = [ "fbsource//third-party/pypi/numpy:numpy", - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/serializer:serializer", - "fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/serializer:serializer", - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/tosa:tosa", - "fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/tosa:tosa", + "fbsource//third-party/tosa_tools:serializer", + "fbsource//third-party/tosa_tools:tosa", "//executorch/backends/arm:constants", ":mapping", "//executorch/exir/dialects:lib", @@ -44,7 +41,6 @@ runtime.python_library( "utils.py", ], deps = [ - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/serializer:serializer", ":quant_utils", "//executorch/backends/arm/operators:node_visitor", ], diff --git a/backends/arm/tosa/backend.py b/backends/arm/tosa/backend.py index 08b0d55aaeb..afae6f8163f 100644 --- a/backends/arm/tosa/backend.py +++ b/backends/arm/tosa/backend.py @@ -24,7 +24,7 @@ process_output, process_placeholder, ) -from executorch.backends.arm.tosa.specification import get_tosa_spec +from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult from executorch.exir.backend.compile_spec_schema import CompileSpec from torch.export.exported_program import ExportedProgram @@ -80,38 +80,24 @@ class TOSABackend(BackendDetails): """ @staticmethod - def preprocess( # noqa: C901 + def preprocess(edge_program: ExportedProgram, compile_specs: List[CompileSpec]): + return TOSABackend._preprocess( + edge_program, TosaCompileSpec.from_list(compile_specs) + ) + + @staticmethod + def _preprocess( # noqa: C901 edge_program: ExportedProgram, - compile_spec: List[CompileSpec], + compile_spec: TosaCompileSpec, ) -> PreprocessResult: # if a debug/test build capture output files from TOSA stage - artifact_path = None - output_format = "" - compile_flags = [] - dump_debug_info = None - for spec in compile_spec: - if spec.key == "debug_artifact_path": - artifact_path = spec.value.decode() - if spec.key == "output_format": - output_format = spec.value.decode() - if spec.key == "compile_flags": - compile_flags.append(spec.value.decode()) - if spec.key == "dump_debug_info": - dump_debug_info = spec.value.decode() - - # Check that the output format is set correctly in the compile spec - if output_format != "tosa": - raise ValueError(f'Invalid output format {output_format}, must be 
"tosa"') + artifact_path = compile_spec.get_intermediate_path() + tosa_spec = compile_spec.tosa_spec + dump_debug_info = compile_spec.tosa_debug_mode # Assign to every node external id node_2_id = _annotate_external_ids(edge_program.graph) - tosa_spec = get_tosa_spec(compile_spec) - if tosa_spec is None: - raise ValueError( - "TOSA backend needs a TOSA version specified in the CompileSpec" - ) - logger.info(f"Converting ExportedProgram to TOSA: {tosa_spec}") # Converted output for this subgraph, serializer needs path early as it emits @@ -132,7 +118,7 @@ def preprocess( # noqa: C901 debug_hook = None if dump_debug_info is not None: - debug_hook = DebugHook(ArmCompileSpec.DebugMode[dump_debug_info]) + debug_hook = DebugHook(dump_debug_info) # TODO: Fix the need to lazily import this. from executorch.backends.arm.operators.node_visitor import get_node_visitors @@ -204,8 +190,8 @@ def _sort_key(t: Node) -> int: @staticmethod def filter_tosa_compile_specs( - compile_spec: List[CompileSpec], - ) -> List[CompileSpec]: + compile_spec: ArmCompileSpec, + ) -> TosaCompileSpec: """ Filter out the CompileSpec elements relevant for the TOSA backend. This is needed to compose a backend targetting hardware IP with the @@ -214,17 +200,9 @@ def filter_tosa_compile_specs( flatbuffer can then be consumed by the backend targetting specific hardware. """ - tosa_compile_spec = [] - tosa_compile_spec.append(CompileSpec("output_format", "tosa".encode())) - - # Copy everything that's TOSA generic - tosa_backend_compile_spec_keys = [ - "tosa_spec", - "debug_artifact_path", - ] - for spec in compile_spec: - if spec.key in tosa_backend_compile_spec_keys: - tosa_compile_spec.append(CompileSpec(spec.key, spec.value)) - - return tosa_compile_spec + new_compile_spec = TosaCompileSpec.__new__(TosaCompileSpec) + new_compile_spec._set_compile_specs( + compile_spec.tosa_spec, [], compile_spec.get_intermediate_path() + ) + return new_compile_spec diff --git a/backends/arm/tosa/partitioner.py b/backends/arm/tosa/partitioner.py index ab381470968..3e512847109 100644 --- a/backends/arm/tosa/partitioner.py +++ b/backends/arm/tosa/partitioner.py @@ -65,6 +65,7 @@ def __init__( self.delegation_spec = DelegationSpec( TOSABackend.__name__, compile_spec.to_list() ) + self.tosa_spec = compile_spec.tosa_spec self.additional_checks = additional_checks self.tosa_spec = compile_spec.tosa_spec @@ -75,13 +76,13 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: # no logger.info("TOSAPartitioner::partition") partition_tags: dict[str, DelegationSpec] = {} - tosa_spec = self.tosa_spec - - logger.info(f"Partitioning for {self.delegation_spec.backend_id}: {tosa_spec}") + logger.info( + f"Partitioning for {self.delegation_spec.backend_id}: {self.tosa_spec}" + ) reporter = WhyNoPartitionReporter() operator_support = tosa_support_factory( - tosa_spec, exported_program, reporter, self.additional_checks + self.tosa_spec, exported_program, reporter, self.additional_checks ) capability_partitioner = CapabilityBasedPartitioner( exported_program.graph_module, @@ -131,7 +132,7 @@ def is_partitioned(node: torch.fx.Node, tag=tag) -> bool: break continue - if tosa_spec.support_float(): + if self.tosa_spec.support_float(): continue if is_partitioned(node): @@ -163,7 +164,7 @@ def is_partitioned(node: torch.fx.Node, tag=tag) -> bool: ) tag_constant_data(exported_program) - logger.info(f"The following nodes were rejected for {tosa_spec}:") + logger.info(f"The following nodes were rejected for {self.tosa_spec}:") logger.info("\n" + 
reporter.get_table_report()) logger.info("(Placeholders and outputs are not included in this list)") return PartitionResult( @@ -213,8 +214,7 @@ def filter_fn(node: torch.fx.Node) -> bool: torch.ops.aten.logit.default, ] + ops_to_not_decompose_if_quant_op - tosa_spec = self.tosa_spec - if not tosa_spec.is_U55_subset: + if not self.tosa_spec.is_U55_subset: # Tosa operator "RESIZE" is not supported on U55. Since upsample_bilinear2d # and upsample_nearest2d decompose into that it will not be possible to # delegate those operators on U55. If we have said here to not decompose diff --git a/backends/arm/tosa/specification.py b/backends/arm/tosa/specification.py index 92b68955cdd..b372cd5a636 100644 --- a/backends/arm/tosa/specification.py +++ b/backends/arm/tosa/specification.py @@ -15,10 +15,6 @@ import re from typing import List -from executorch.exir.backend.compile_spec_schema import ( # type: ignore[import-not-found] - CompileSpec, -) - from packaging.version import Version @@ -199,10 +195,3 @@ def get_context_spec() -> TosaSpecification: return TosaLoweringContext.tosa_spec_var.get() except LookupError: raise RuntimeError("Function must be executed within a TosaLoweringContext") - - -def get_tosa_spec(compile_spec: List[CompileSpec]) -> TosaSpecification: - for spec in compile_spec: - if spec.key == "tosa_spec": - return TosaSpecification.create_from_string(spec.value.decode()) - raise ValueError("Could not find TOSA version in CompileSpec") diff --git a/backends/arm/vgf/backend.py b/backends/arm/vgf/backend.py index 7c408748529..3f65456bf8b 100644 --- a/backends/arm/vgf/backend.py +++ b/backends/arm/vgf/backend.py @@ -22,6 +22,7 @@ arm_get_first_delegation_tag, TOSABackend, ) +from executorch.backends.arm.vgf.compile_spec import VgfCompileSpec from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult from executorch.exir.backend.compile_spec_schema import CompileSpec from torch.export.exported_program import ExportedProgram @@ -40,21 +41,15 @@ class VgfBackend(BackendDetails): @staticmethod def _compile_tosa_flatbuffer( tosa_flatbuffer: bytes, - compile_spec: List[CompileSpec], + compile_spec: VgfCompileSpec, tag_name: str = "", ) -> bytes: """ Static helper method to do the compilation of the TOSA flatbuffer representation to a target specific binary stream. """ - compile_flags = [] - artifact_path = None - for spec in compile_spec: - if spec.key == "compile_flags": - compile_flags.append(spec.value.decode()) - if spec.key == "debug_artifact_path": - artifact_path = spec.value.decode() - + compile_flags = compile_spec.compiler_flags + artifact_path = compile_spec.get_intermediate_path() # Pass on the TOSA flatbuffer to the vgf compiler. binary = vgf_compile(tosa_flatbuffer, compile_flags, artifact_path, tag_name) return binary @@ -62,10 +57,11 @@ def _compile_tosa_flatbuffer( @staticmethod def preprocess( edge_program: ExportedProgram, - compile_spec: List[CompileSpec], + compile_specs: List[CompileSpec], ) -> PreprocessResult: logger.info(f"{VgfBackend.__name__} preprocess") + compile_spec = VgfCompileSpec.from_list(compile_specs) # deduce TOSA compile_spec from VGF compile spec. We get a new # compile spec list, containing only elements relevant for the # TOSABackend. @@ -75,7 +71,7 @@ def preprocess( # ('All backend implementation are final...'), so use composition instead. # preprocess returns the serialized TOSA flatbuffer in .processed_bytes, # which can be passed on to next compilation step. 
- tosa_preprocess = TOSABackend.preprocess(edge_program, tosa_compile_spec) + tosa_preprocess = TOSABackend._preprocess(edge_program, tosa_compile_spec) tag_name = arm_get_first_delegation_tag(edge_program.graph_module) diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index 27f9c00f4ac..d547a1ed555 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -344,6 +344,7 @@ python_unittest( typing = True, deps = [ ":ops_registrations", + ":typing_stubs", ":type_dispatch", "//caffe2:torch", "//executorch/backends/cadence/aot:graph_builder", diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml index 196480931e0..c8e7d6cb3fc 100644 --- a/backends/cadence/aot/functions.yaml +++ b/backends/cadence/aot/functions.yaml @@ -359,6 +359,26 @@ - arg_meta: null kernel_name: impl::reference::quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out +- func: cadence::quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::reference::quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out + +- func: cadence::quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::reference::quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out + +- func: cadence::quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::reference::quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out + +- func: cadence::quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::reference::quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out + - func: cadence::quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index cf4c5a8fffb..1b62c215ab6 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -370,6 +370,26 @@ - arg_meta: null kernel_name: cadence::impl::HiFi::quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out +- func: cadence::quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out + +- func: cadence::quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out + +- func: cadence::quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out + +- func: cadence::quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out + - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index 35b4cbf3902..efb22a9e7d6 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -169,6 +169,30 @@ lib.define( "quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) +lib.define( + "quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" +) lib.define( "quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) @@ -2153,6 +2177,150 @@ def roi_align_box_processor_meta( return rois.new_empty((rois.shape[0], 80), dtype=torch.uint8) +@register_fake("cadence::quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor") +def quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + assert input.dim() == 3 and weight.dim() == 3 + assert ( + input.dtype == torch.int8 + and weight.dtype == torch.int8 + and bias.dtype == torch.int32 + ) + out_channels, _, kernel_size = weight.shape + output_size = get_conv1d_output_size( + input.shape, + out_channels, + stride[1], + padding[1], + dilation[1], + kernel_size, + False, + ) + return input.new_empty(output_size, dtype=input.dtype) + + +@register_fake("cadence::quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor") +def quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + assert input.dim() == 3 and weight.dim() == 3 + assert ( + input.dtype == torch.uint8 + and weight.dtype == torch.uint8 + and bias.dtype == torch.int32 + ) + out_channels, _, kernel_size = weight.shape + output_size = get_conv1d_output_size( + input.shape, + out_channels, + stride[1], + padding[1], + dilation[1], + kernel_size, + False, + ) + return input.new_empty(output_size, dtype=input.dtype) + + +@register_fake("cadence::quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor") +def quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + assert input.dim() == 3 and weight.dim() == 3 + assert ( + input.dtype == torch.int8 + and weight.dtype == torch.int8 + and bias.dtype == torch.int32 + ) + out_channels, kernel_size, _ = weight.shape + output_size = get_conv1d_output_size( + input.shape, + out_channels, + stride[1], + padding[1], + dilation[1], + kernel_size, + True, + ) + return input.new_empty(output_size, dtype=input.dtype) + + +@register_fake("cadence::quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor") +def quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, 
+ out_shift: int, +) -> torch.Tensor: + assert input.dim() == 3 and weight.dim() == 3 + assert ( + input.dtype == torch.uint8 + and weight.dtype == torch.uint8 + and bias.dtype == torch.int32 + ) + out_channels, kernel_size, _ = weight.shape + output_size = get_conv1d_output_size( + input.shape, + out_channels, + stride[1], + padding[1], + dilation[1], + kernel_size, + True, + ) + return input.new_empty(output_size, dtype=input.dtype) + + @register_fake("cadence::_softmax_f32_f32") def softmax_f32_f32_meta( self: torch.Tensor, diff --git a/backends/cadence/aot/quantizer/fusion_pass.py b/backends/cadence/aot/quantizer/fusion_pass.py index 729056ea2c8..8f106a815ac 100644 --- a/backends/cadence/aot/quantizer/fusion_pass.py +++ b/backends/cadence/aot/quantizer/fusion_pass.py @@ -15,7 +15,11 @@ BmmPattern, CatPattern, Conv1dPattern, + Conv1dReluPattern0, + Conv1dReluPattern1, Conv2dPattern, + Conv2dReluPattern0, + Conv2dReluPattern1, LayerNormPattern, LinearPattern, MatmulPattern, @@ -23,6 +27,7 @@ ReluPattern1, ) from executorch.backends.cadence.aot.quantizer.utils import ( + check_out_zero_point_is_min_range, create_zero_bias_int32, find_sequential_partitions_aten, get_conv_args, @@ -41,6 +46,13 @@ # Use this part for patterns with multiple aten ops ReluPatterns = (ReluPattern0, ReluPattern1) +ConvPatterns = (Conv1dPattern, Conv2dPattern) +ConvReluPatterns = ( + Conv1dReluPattern0, + Conv1dReluPattern1, + Conv2dReluPattern0, + Conv2dReluPattern1, +) def get_args_and_kwargs_add( @@ -432,12 +444,12 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 other_inputs = [node.args[idx] for node, idx in anchors.others] # The node is the first index of the list and first of the tuple - op_node = anchors.output[0][0] + anchor_output_node = anchors.output[0][0] - assert len(op_node.users) == 1 - quant_node = list(op_node.users.keys())[0] + assert len(anchor_output_node.users) == 1 + quant_node = list(anchor_output_node.users.keys())[0] - with graph_module.graph.inserting_after(op_node): + with graph_module.graph.inserting_after(anchor_output_node): args = tuple( inputs_inputs + weights_inputs + other_inputs + bias_inputs ) @@ -451,9 +463,29 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 ) elif isinstance(pattern, CatPattern): args, kwargs = get_args_and_kwargs_cat( - inputs_inputs, other_inputs, op_node + inputs_inputs, other_inputs, anchor_output_node + ) + elif isinstance(pattern, ConvReluPatterns): + # For ConvReLU, we are fusing Conv+ReLU + # This means that the op we want to get + # the replacement args and kwargs for is the + # *conv* op, which is the anchor input, NOT + # the anchor output (which is the ReLU) + check_out_zero_point_is_min_range( + quant_node.args[2], quant_node.args[5] + ) + anchor_input_node = anchors.inputs[0][0] + args, kwargs = get_args_and_kwargs_conv( + graph_module, + inputs_inputs, + dequants_inputs, + weights_inputs, + dequants_weights, + bias_inputs, + quant_node, + anchor_input_node, ) - elif isinstance(pattern, (Conv1dPattern, Conv2dPattern)): + elif isinstance(pattern, ConvPatterns): args, kwargs = get_args_and_kwargs_conv( graph_module, inputs_inputs, @@ -462,7 +494,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 dequants_weights, bias_inputs, quant_node, - op_node, + anchor_output_node, ) elif isinstance(pattern, LinearPattern): args, kwargs = get_args_and_kwargs_linear( diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py index 
74987f8b38d..b653be27e8f 100644 --- a/backends/cadence/aot/quantizer/patterns.py +++ b/backends/cadence/aot/quantizer/patterns.py @@ -417,3 +417,71 @@ def partition_types(self) -> List[OpOverload]: class ReluPattern1(ReluBasePattern): def partition_types(self) -> List[OpOverload]: return [torch.ops.aten.relu_.default] + + +# This is a base class for Conv+ReLU fusion, since it can be used with two different relu aten ops +class ConvReluBasePattern(QuantizationPattern): + @abstractmethod + def partition_types(self) -> List[OpOverload]: + pass + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> PartitionAnchors: + # The first node should be conv, the second should be relu + # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... + conv_node = fused_partition[0].nodes[-1] # Second to last node + # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... + relu_node = fused_partition[1].nodes[-1] # Last node + + bias_qspec = DerivedQuantizationSpec( + derived_from=[ + (conv_node.args[0], conv_node), + (conv_node.args[1], conv_node), + ], + derive_qparams_fn=get_bias_qparams, + dtype=torch.int32, + quant_min=-(2**31), + quant_max=2**31 - 1, + qscheme=torch.per_tensor_affine, + ) + + # Keep bias empty if not supplied + bias = [] + if len(conv_node.args) > 2 and conv_node.args[2] is not None: + bias = [(conv_node, 2, bias_qspec)] + + return PartitionAnchors( + inputs=[(conv_node, 0)], + weights=[(conv_node, 1)], + # pyre-fixme[6]: Incompatible parameter type + biases=bias, + output=[(relu_node,)], # Output is from the relu node + ) + + def replacement_op(self) -> OpOverload: + return torch.ops.cadence.quantized_conv_nchw.default + + +# Conv1d + regular relu op fusion +class Conv1dReluPattern0(ConvReluBasePattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.conv1d.default, torch.ops.aten.relu.default] + + +# Conv1d + alternate relu op fusion +class Conv1dReluPattern1(ConvReluBasePattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.conv1d.default, torch.ops.aten.relu_.default] + + +# Conv2d + regular relu op fusion +class Conv2dReluPattern0(ConvReluBasePattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.conv2d.default, torch.ops.aten.relu.default] + + +# Conv2d + alternate relu op fusion +class Conv2dReluPattern1(ConvReluBasePattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.conv2d.default, torch.ops.aten.relu_.default] diff --git a/backends/cadence/aot/quantizer/quantizer.py b/backends/cadence/aot/quantizer/quantizer.py index 8c78ac87e58..cce7c207a6b 100644 --- a/backends/cadence/aot/quantizer/quantizer.py +++ b/backends/cadence/aot/quantizer/quantizer.py @@ -16,7 +16,11 @@ BmmPattern, CatPattern, Conv1dPattern, + Conv1dReluPattern0, + Conv1dReluPattern1, Conv2dPattern, + Conv2dReluPattern0, + Conv2dReluPattern1, LayerNormPattern, LinearPattern, MatmulPattern, @@ -260,3 +264,22 @@ def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None: quantizers.append(CadenceAtenQuantizer(AddPattern(), qconfig_A8W8)) quantizers.append(CadenceAtenQuantizer(CatPattern(), qconfig_A8W8)) super().__init__(quantizers) + + +class CadenceFusedConvReluQuantizer(CadenceQuantizer): + """ + Quantizer using fused conv+relu patterns, and including add and cat + """ + + def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None: + if quantizers is None: + quantizers = [] + # Order 
matters here, perform the "fused" patterns first + quantizers.append(CadenceAtenQuantizer(Conv1dReluPattern0(), qconfig_A8W8sym)) + quantizers.append(CadenceAtenQuantizer(Conv1dReluPattern1(), qconfig_A8W8sym)) + quantizers.append(CadenceAtenQuantizer(Conv2dReluPattern0(), qconfig_A8W8sym)) + quantizers.append(CadenceAtenQuantizer(Conv2dReluPattern1(), qconfig_A8W8sym)) + quantizers = quantizers + get_cadence_default_quantizers() + quantizers.append(CadenceAtenQuantizer(AddPattern(), qconfig_A8W8)) + quantizers.append(CadenceAtenQuantizer(CatPattern(), qconfig_A8W8)) + super().__init__(quantizers) diff --git a/backends/cadence/aot/quantizer/utils.py b/backends/cadence/aot/quantizer/utils.py index beacd1b9e86..68fc6740cb4 100644 --- a/backends/cadence/aot/quantizer/utils.py +++ b/backends/cadence/aot/quantizer/utils.py @@ -234,3 +234,19 @@ def find_sequential_partitions_aten( if _partitions_sequential(candidate): fused_partitions.append(candidate) return fused_partitions + + +def check_out_zero_point_is_min_range( + out_zero_point: int, + out_dtype: torch.dtype, +) -> bool: + """ + Checks if the out_zero_point is the minimum range of the quant type. + """ + if out_dtype == torch.int8: + return out_zero_point == -128 + elif out_dtype == torch.int16: + return out_zero_point == -32768 + elif out_dtype == torch.uint8 or torch.uint16: + return out_zero_point == 0 + return False diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py index 40ae6d23085..2a53c2dde7a 100644 --- a/backends/cadence/aot/ref_implementations.py +++ b/backends/cadence/aot/ref_implementations.py @@ -127,14 +127,14 @@ def dequantize_per_tensor( return (input_tensor - zero_point).to(dtype) * scale -@impl(m, "quantized_add") -def quantized_add( +@impl(m, "quantized_add.per_tensor") +def quantized_add_per_tensor( X: torch.Tensor, - X_scale: torch.Tensor, - X_zero_point: torch.Tensor, + X_scale: float, + X_zero_point: int, Y: torch.Tensor, - Y_scale: torch.Tensor, - Y_zero_point: torch.Tensor, + Y_scale: float, + Y_zero_point: int, out_scale: float, out_zero_point: int, ) -> torch.Tensor: @@ -149,17 +149,17 @@ def quantized_add( out = (X_scale(X - X_zero_point) + Y_scale(Y - Y_zero_point)) / out_scale + out_zero_point Args: - - X (Tensor): The first operand - - X_scale (Tensor): The ratio between the sizes of X's floating point and quantized + - X: The first operand + - X_scale: The ratio between the sizes of X's floating point and quantized ranges - - X_zero_point (Tensor): The quantized mapping of zero for X - - Y (Tensor): The second operand - - Y_scale (Tensor): The ratio between the sizes of Y's floating point and quantized + - X_zero_point: The quantized mapping of zero for X + - Y: The second operand + - Y_scale: The ratio between the sizes of Y's floating point and quantized ranges - - Y_zero_point (Tensor): The quantized mapping of zero for Y - - out_scale (float): The ratio between the sizes of the output's floating point and + - Y_zero_point: The quantized mapping of zero for Y + - out_scale: The ratio between the sizes of the output's floating point and quantized ranges - - out_zero_point (int): The quantized mapping of zero for the output + - out_zero_point: The quantized mapping of zero for the output """ supported_dtypes = [torch.int8, torch.uint8] if X.dtype != Y.dtype: @@ -193,13 +193,55 @@ def quantized_add( ) +@impl(m, "quantized_add_asym8sxasym8s_asym8s.per_tensor") +def quantized_add_asym8sxasym8s_asym8s_per_tensor( + X: torch.Tensor, + X_scale: float, + 
X_zero_point: int, + Y: torch.Tensor, + Y_scale: float, + Y_zero_point: int, + out_scale: float, + out_zero_point: int, +) -> torch.Tensor: + if X.dtype != torch.int8: + raise ValueError("X dtype must be torch.int8") + if Y.dtype != torch.int8: + raise ValueError("Y dtype must be torch.int8") + + return quantized_add_per_tensor( + X, X_scale, X_zero_point, Y, Y_scale, Y_zero_point, out_scale, out_zero_point + ) + + +@impl(m, "quantized_add_asym8uxasym8u_asym8u.per_tensor") +def quantized_add_asym8uxasym8u_asym8u_per_tensor( + X: torch.Tensor, + X_scale: float, + X_zero_point: int, + Y: torch.Tensor, + Y_scale: float, + Y_zero_point: int, + out_scale: float, + out_zero_point: int, +) -> torch.Tensor: + if X.dtype != torch.uint8: + raise ValueError("X dtype must be torch.int8") + if Y.dtype != torch.uint8: + raise ValueError("Y dtype must be torch.int8") + + return quantized_add_per_tensor( + X, X_scale, X_zero_point, Y, Y_scale, Y_zero_point, out_scale, out_zero_point + ) + + def quantized_linear_common( src: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, in_zero_point: int, weight_zero_point: torch.Tensor | int, - out_multiplier: torch.Tensor | int, + out_multiplier: int, out_shift: int, out_zero_point: int, ) -> torch.Tensor: @@ -287,34 +329,30 @@ def variant( assert isinstance(weight_zero_point, int) assert isinstance(out_multiplier, int) assert isinstance(out_shift, int) - return quantized_linear_common( - src, - weight, - bias, - in_zero_point, - weight_zero_point, - out_multiplier, - out_shift, - out_zero_point, - ) + _out_shift = out_shift + _out_multiplier = out_multiplier else: assert isinstance(out_shift, torch.Tensor) + assert isinstance(out_multiplier, torch.Tensor) if out_shift.numel() != 1: raise ValueError("out_shift must be a scalar") if out_shift.dtype != torch.int64: raise ValueError("out_shift must be an int64") - return quantized_linear_common( - src, - weight, - bias, - in_zero_point, - weight_zero_point, - out_multiplier, - int(out_shift.item()), - out_zero_point, - ) + _out_shift = int(out_shift.item()) + _out_multiplier = int(out_multiplier[0].item()) + + return quantized_linear_common( + src, + weight, + bias, + in_zero_point, + weight_zero_point, + _out_multiplier, + _out_shift, + out_zero_point, + ) return variant @@ -361,6 +399,112 @@ def quantized_fully_connected_asym8sxasym8s_asym8s_per_tensor() -> torch.Tensor: def quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor() -> torch.Tensor: ... +@impl(m, "quantized_matmul") +def quantized_matmul( + X: torch.Tensor, + X_zero_point: int, + Y: torch.Tensor, + Y_zero_point: int, + bias: torch.Tensor | None, + out_multiplier: int, + out_shift: int, + out_zero_point: int, + transposed: bool = False, +) -> torch.Tensor: + """ + Quantized matmul operation. 
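+
+ Reference semantics (as implemented below via quantized_linear_common):
+ the operands, offset by their zero points, are matrix-multiplied and the
+ int32 accumulator is requantized with out_multiplier/out_shift and offset
+ by out_zero_point. When `transposed` is False, Y is transposed here because
+ the underlying linear helper expects a pre-transposed weight; bias must be
+ None or all zeros, since it is unused in the out variant.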
+ + Args: + - X (Tensor): The activations tensor + - X_zero_point (int): The quantized mapping of zero for the input + - Y (Tensor): The weight tensor + - Y_zero_point (int): The quantized mapping of zero for the weight + - bias (Tensor): The bias tensor + - out_multiplier (int): The multiplier used to scale the output + - out_shift (int): The shift used to scale the output + - out_zero_point (int): The quantized mapping of zero for the output + - transposed (bool): Whether to transpose the weight tensor + """ + if bias is not None and not torch.all(bias == 0): + raise ValueError("bias must be None or all zeros since unused in out variant") + + # Looks weird, but quantized linear assumes weights are pre-transposed, + # hence we transpose only if `transposed` is False. + if not transposed: + Y = Y.T + + return quantized_linear_common( + X, + Y, + bias or torch.zeros(1, dtype=torch.int32), + X_zero_point, + Y_zero_point, + out_multiplier, + out_shift, + out_zero_point, + ) + + +@impl(m, "quantized_matmul_asym8sxasym8s_asym8s") +def quantized_matmul_asym8sxasym8s_asym8s( + X: torch.Tensor, + X_zero_point: int, + Y: torch.Tensor, + Y_zero_point: int, + bias: torch.Tensor | None, + out_multiplier: int, + out_shift: int, + out_zero_point: int, + transposed: bool = False, +) -> torch.Tensor: + if X.dtype != torch.int8: + raise ValueError("X dtype must be torch.int8") + if Y.dtype != torch.int8: + raise ValueError("Y dtype must be torch.int8") + + return quantized_matmul( + X, + X_zero_point, + Y, + Y_zero_point, + bias, + out_multiplier, + out_shift, + out_zero_point, + transposed, + ) + + +@impl(m, "quantized_matmul_asym8uxasym8u_asym8u") +def quantized_matmul_asym8uxasym8u_asym8u( + X: torch.Tensor, + X_zero_point: int, + Y: torch.Tensor, + Y_zero_point: int, + bias: torch.Tensor | None, + out_multiplier: int, + out_shift: int, + out_zero_point: int, + transposed: bool = False, +) -> torch.Tensor: + if X.dtype != torch.uint8: + raise ValueError("X dtype must be torch.uint8") + if Y.dtype != torch.uint8: + raise ValueError("Y dtype must be torch.uint8") + + return quantized_matmul( + X, + X_zero_point, + Y, + Y_zero_point, + bias, + out_multiplier, + out_shift, + out_zero_point, + transposed, + ) + + @impl(m, "quantized_layer_norm.per_tensor") def quantized_layer_norm_per_tensor( input_tensor: torch.Tensor, @@ -613,6 +757,7 @@ def quantized_conv_variant( layout: str, input_dtype: torch.dtype, weight_dtype: torch.dtype, + is_1d: bool = False, ) -> Callable[[Callable[..., torch.Tensor]], Callable[..., torch.Tensor]]: """Create a quantized conv variant with type checking.""" @@ -644,6 +789,14 @@ def variant( bias.dtype == torch.int32 ), f"Expected bias dtype int32, got {bias.dtype}" + if is_1d: + assert ( + len(input_tensor.shape) == 3 + ), f"1D convolution requires 3D input tensor, got {len(input_tensor.shape)}D" + assert ( + len(weight.shape) == 3 + ), f"1D convolution requires 3D weight tensor, got {len(weight.shape)}D" + # Call the appropriate base function match layout: case "nchw": @@ -748,6 +901,26 @@ def quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor() -> torch.Tens def quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... +@impl(m, "quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor") +@quantized_conv_variant("nchw", torch.int8, torch.int8, is_1d=True) +def quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... 
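+
+ # The quantized_conv1d_* per-tensor variants reuse the same reference conv
+ # implementation; is_1d=True only adds the 3-D shape checks. As an
+ # illustration (assuming the usual conv1d output-length formula
+ # L_out = (L + 2 * padding - dilation * (kernel - 1) - 1) // stride + 1,
+ # and groups == 1, which is what the type-dispatch pass routes here):
+ #   nchw: input (N, C_in, L), weight (C_out, C_in, K) -> output (N, C_out, L_out)
+ #   nhwc: input (N, L, C_in), weight (C_out, K, C_in) -> output (N, L_out, C_out)
+ #   e.g. x: int8 (1, 3, 8), w: int8 (16, 3, 3), b: int32 (16,), stride=[1, 1],
+ #   padding=[0, 0], dilation=[1, 1]  ->  out: int8 (1, 16, 6)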
+ + +@impl(m, "quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor") +@quantized_conv_variant("nchw", torch.uint8, torch.uint8, is_1d=True) +def quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... + + +@impl(m, "quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor") +@quantized_conv_variant("nhwc", torch.int8, torch.int8, is_1d=True) +def quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... + + +@impl(m, "quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor") +@quantized_conv_variant("nhwc", torch.uint8, torch.uint8, is_1d=True) +def quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... + + def quantized_relu_common( X: torch.Tensor, X_zero_point: torch.Tensor | int, diff --git a/backends/cadence/aot/tests/test_ref_implementations.py b/backends/cadence/aot/tests/test_ref_implementations.py index 04b3e8e75ba..30b30e085dc 100644 --- a/backends/cadence/aot/tests/test_ref_implementations.py +++ b/backends/cadence/aot/tests/test_ref_implementations.py @@ -100,7 +100,7 @@ def test_dequantize_per_tensor( [ # Only these types need to be tested as per ET_FORALL_JARVIS_QUANTIZED_TYPES in # on_device_ai/Assistant/Jarvis/min_runtime/operators/generic/operators.h - ("int16", 5, 0.8, 4, 5, 0.8, 4, 0.8, 4, 6, torch.int8), + ("int8", 5, 0.8, 4, 5, 0.8, 4, 0.8, 4, 6, torch.int8), ("uint8", 5, 0.8, 4, 5, 0.8, 4, 0.8, 4, 6, torch.uint8), ] ) @@ -122,13 +122,34 @@ def test_quantized_add( Y_tensor = torch.tensor([Y], dtype=dtype) expected_output = torch.tensor([expected_value], dtype=dtype) + quantized_add = ( + torch.ops.cadence.quantized_add_asym8sxasym8s_asym8s.per_tensor + if dtype == torch.int8 + else torch.ops.cadence.quantized_add_asym8uxasym8u_asym8u.per_tensor + ) + output = quantized_add( + X_tensor, + X_scale, + X_zero_point, + Y_tensor, + Y_scale, + Y_zero_point, + out_scale, + out_zero_point, + ) + + self.assertTrue( + torch.equal(output, expected_output), + f"Values don't match in {name}: got {output}, expected {expected_output}", + ) + output = torch.ops.cadence.quantized_add( X_tensor, - torch.tensor(X_scale), - torch.tensor(X_zero_point, dtype=dtype), + X_scale, + X_zero_point, Y_tensor, - torch.tensor(Y_scale), - torch.tensor(Y_zero_point, dtype=dtype), + Y_scale, + Y_zero_point, out_scale, out_zero_point, ) @@ -156,6 +177,8 @@ def test_quantized_add( 0, # out_zero_point torch.tensor([[-2]], dtype=dtype), # expected_output per_tensor, + False, + False, ) for (per_tensor, dtype) in ( (False, torch.int8), @@ -179,6 +202,8 @@ def test_quantized_add( 0, # out_zero_point torch.tensor([[-10, -30]], dtype=dtype), # expected_output per_tensor, + False, + False, ) for (per_tensor, dtype) in ( (False, torch.int8), @@ -204,6 +229,8 @@ def test_quantized_add( [[[-2, -8, -14], [-6, -28, -50]]], dtype=dtype ), # expected_output per_tensor, + False, + False, ) for (per_tensor, dtype) in ( (False, torch.int8), @@ -227,6 +254,8 @@ def test_quantized_add( 1, # out_zero_point torch.tensor([[-15, 25]], dtype=dtype), # expected_output per_tensor, + False, + False, ) for (per_tensor, dtype) in ( (False, torch.int8), @@ -250,6 +279,8 @@ def test_quantized_add( 1, # out_zero_point torch.tensor([[-23, 17]], dtype=dtype), # expected_output False, + False, + False, ) for dtype in (torch.int8, torch.uint8) ], @@ -271,9 +302,34 @@ def test_quantized_add( 1, # out_zero_point torch.tensor([[-7, 13]], dtype=dtype), # expected_output per_tensor, + False, + False, ) for (per_tensor, dtype) in ((False, torch.int8), (True, torch.int8)) ], + *[ + ( + 
torch.Size([1, 2]), # src_shape: 1 sample, 2 input features + torch.Size( + [2, 2] + ), # weight_shape: 2 output features, 2 input features + 2, # in_zero_point + torch.tensor([1, 1], dtype=dtype), # weight_zero_point + torch.tensor( + [268435456], dtype=torch.int32 + ), # out_multiplier (0.125 * 2^31) + torch.tensor( + [1], dtype=torch.int64 + ), # out_shift (shift=1, doubles the scale) + 1, # out_zero_point + torch.tensor([[-7, 17]], dtype=dtype), # expected_output + per_tensor, + matmul, + transposed_matmul, + ) + for (matmul, transposed_matmul) in ((True, False), (True, True)) + for (per_tensor, dtype) in ((True, torch.int8), (True, torch.uint8)) + ], ] ) def test_quantized_linear( @@ -287,7 +343,12 @@ def test_quantized_linear( out_zero_point: int, expected_output: torch.Tensor, per_tensor: bool, + matmul: bool, + transposed_matmul: bool, ) -> None: + if not per_tensor and matmul: + self.skipTest("Only per_tensor supported for matmul") + src = ( torch.arange(np.prod(src_shape)) .reshape(src_shape) @@ -298,7 +359,9 @@ def test_quantized_linear( .reshape(weight_shape) .to(expected_output.dtype) ) - bias = torch.arange(weight_shape[0]).to(torch.int32) + if matmul and not transposed_matmul: + weight = weight.T + if per_tensor: weight_zero_point = weight_zero_point[0] out_multiplier = out_multiplier[0] @@ -307,20 +370,34 @@ def test_quantized_linear( if per_tensor: match expected_output.dtype: case torch.int8: - linear_ops = ( - torch.ops.cadence.quantized_linear_asym8sxasym8s_asym8s.per_tensor, - torch.ops.cadence.quantized_fully_connected_asym8sxasym8s_asym8s.per_tensor, - ) + if matmul: + linear_ops = ( + # Doesn't have per tensor name, but it is per tensor + torch.ops.cadence.quantized_matmul_asym8sxasym8s_asym8s, + ) + else: + linear_ops = ( + torch.ops.cadence.quantized_linear_asym8sxasym8s_asym8s.per_tensor, + torch.ops.cadence.quantized_fully_connected_asym8sxasym8s_asym8s.per_tensor, + ) case torch.uint8: - linear_ops = ( - torch.ops.cadence.quantized_linear_asym8uxasym8u_asym8u.per_tensor, - torch.ops.cadence.quantized_fully_connected_asym8uxasym8u_asym8u.per_tensor, - ) + if matmul: + linear_ops = ( + torch.ops.cadence.quantized_matmul_asym8uxasym8u_asym8u, + ) + else: + linear_ops = ( + torch.ops.cadence.quantized_linear_asym8uxasym8u_asym8u.per_tensor, + torch.ops.cadence.quantized_fully_connected_asym8uxasym8u_asym8u.per_tensor, + ) case _: - linear_ops = ( - torch.ops.cadence.quantized_linear.per_tensor, - torch.ops.cadence.quantized_fully_connected.per_tensor, - ) + if matmul: + linear_ops = (torch.ops.cadence.quantized_matmul,) + else: + linear_ops = ( + torch.ops.cadence.quantized_linear.per_tensor, + torch.ops.cadence.quantized_fully_connected.per_tensor, + ) else: linear_ops = ( torch.ops.cadence.quantized_linear, @@ -328,17 +405,40 @@ def test_quantized_linear( ) for linear_op in linear_ops: - output = linear_op( - src, - weight, - bias, - in_zero_point, - weight_zero_point, - out_multiplier, - out_shift, - out_zero_point, - typing.cast(torch.Tensor, None), + # Get the function name for linear_op for debugging + op_name = ( + linear_op.__name__ if hasattr(linear_op, "__name__") else str(linear_op) ) + if matmul: + assert "quantized_matmul" in op_name + output = linear_op( + src, + in_zero_point, + weight, + weight_zero_point, + None, + out_multiplier, + out_shift, + out_zero_point, + transposed_matmul, + ) + else: + assert ( + "quantized_linear" in op_name + or "quantized_fully_connected" in op_name + ) + bias = torch.arange(weight_shape[0]).to(torch.int32) + output = 
linear_op( + src, + weight, + bias, + in_zero_point, + weight_zero_point, + out_multiplier, + out_shift, + out_zero_point, + typing.cast(torch.Tensor, None), + ) self.assertTrue(output.dtype == expected_output.dtype, "Dtype mismatch") diff --git a/backends/cadence/aot/tests/test_type_dispatch_passes.py b/backends/cadence/aot/tests/test_type_dispatch_passes.py index 52904aecb41..4ae10ea83dd 100644 --- a/backends/cadence/aot/tests/test_type_dispatch_passes.py +++ b/backends/cadence/aot/tests/test_type_dispatch_passes.py @@ -13,41 +13,36 @@ from executorch.backends.cadence.aot.graph_builder import single_op_builder from executorch.backends.cadence.aot.pass_utils import count_node from executorch.backends.cadence.aot.type_dispatch import CompileTimeTypeDispatchPass +from executorch.backends.cadence.aot.typing_stubs import expand from executorch.exir.dialects._ops import ops as exir_ops from torch.fx.passes.infra.pass_base import PassResult class TestTypeDispatchPasses(unittest.TestCase): - def test_int8_dispatch_quantized_fully_connected(self) -> None: - """Test int8 x int8 inputs should dispatch to asym8sxasym8s_asym8s variant""" - x = torch.randint(-128, 127, (1, 3), dtype=torch.int8) - w = torch.randint(-128, 127, (4, 3), dtype=torch.int8) - b = torch.randint(-2147483648, 2147483647, (4,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_fully_connected.per_tensor, - args=(x, w, b, 0, 0, 1, 0, 0, None), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_fully_connected.per_tensor), - 0, - ) - # Should be replaced with int8 specific variant - self.assertEqual( - count_node( - gm, + @expand( + [ + ( + "int8", + torch.int8, exir_ops.edge.cadence.quantized_fully_connected_asym8sxasym8s_asym8s.per_tensor, ), - 1, - ) - - def test_uint8_dispatch_quantized_fully_connected(self) -> None: - """Test uint8 x uint8 inputs should dispatch to asym8uxasym8u_asym8u variant""" - x = torch.randint(0, 255, (1, 3), dtype=torch.uint8) - w = torch.randint(0, 255, (4, 3), dtype=torch.uint8) + ( + "uint8", + torch.uint8, + exir_ops.edge.cadence.quantized_fully_connected_asym8uxasym8u_asym8u.per_tensor, + ), + ] + ) + def test_dispatch_quantized_fully_connected( + self, + _: str, + dtype: torch.dtype, + expected_op: torch._ops.OpOverload, + ) -> None: + """Test quantized_fully_connected dispatches to correct dtype-specific variant""" + min_val, max_val = torch.iinfo(dtype).min, torch.iinfo(dtype).max + x = torch.randint(min_val, max_val, (1, 3), dtype=dtype) + w = torch.randint(min_val, max_val, (4, 3), dtype=dtype) b = torch.randint(-2147483648, 2147483647, (4,), dtype=torch.int32) gm = single_op_builder( placeholders=(x, w, b), @@ -61,45 +56,33 @@ def test_uint8_dispatch_quantized_fully_connected(self) -> None: count_node(gm, exir_ops.edge.cadence.quantized_fully_connected.per_tensor), 0, ) - # Should be replaced with uint8 specific variant - self.assertEqual( - count_node( - gm, - exir_ops.edge.cadence.quantized_fully_connected_asym8uxasym8u_asym8u.per_tensor, - ), - 1, - ) + # Should be replaced with dtype-specific variant + self.assertEqual(count_node(gm, expected_op), 1) - def test_int8_dispatch_quantized_linear(self) -> None: - """Test int8 x int8 inputs should dispatch to asym8sxasym8s_asym8s variant for quantized_linear""" - x = torch.randint(-128, 127, (2, 3), dtype=torch.int8) - w = torch.randint(-128, 127, (4, 
3), dtype=torch.int8) - b = torch.randint(-2147483648, 2147483647, (4,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_linear.per_tensor, - args=(x, w, b, 0, 0, 1, 0, 0, None), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_linear.per_tensor), - 0, - ) - # Should be replaced with int8 specific variant - self.assertEqual( - count_node( - gm, + @expand( + [ + ( + "int8", + torch.int8, exir_ops.edge.cadence.quantized_linear_asym8sxasym8s_asym8s.per_tensor, ), - 1, - ) - - def test_uint8_quantized_linear_dispatch(self) -> None: - """Test uint8 x uint8 inputs should dispatch to asym8uxasym8u_asym8u variant for quantized_linear""" - x = torch.randint(0, 255, (2, 3), dtype=torch.uint8) - w = torch.randint(0, 255, (4, 3), dtype=torch.uint8) + ( + "uint8", + torch.uint8, + exir_ops.edge.cadence.quantized_linear_asym8uxasym8u_asym8u.per_tensor, + ), + ] + ) + def test_dispatch_quantized_linear( + self, + _: str, + dtype: torch.dtype, + expected_op: torch._ops.OpOverload, + ) -> None: + """Test quantized_linear dispatches to correct dtype-specific variant""" + min_val, max_val = torch.iinfo(dtype).min, torch.iinfo(dtype).max + x = torch.randint(min_val, max_val, (2, 3), dtype=dtype) + w = torch.randint(min_val, max_val, (4, 3), dtype=dtype) b = torch.randint(-2147483648, 2147483647, (4,), dtype=torch.int32) gm = single_op_builder( placeholders=(x, w, b), @@ -113,14 +96,8 @@ def test_uint8_quantized_linear_dispatch(self) -> None: count_node(gm, exir_ops.edge.cadence.quantized_linear.per_tensor), 0, ) - # Should be replaced with uint8 specific variant - self.assertEqual( - count_node( - gm, - exir_ops.edge.cadence.quantized_linear_asym8uxasym8u_asym8u.per_tensor, - ), - 1, - ) + # Should be replaced with dtype-specific variant + self.assertEqual(count_node(gm, expected_op), 1) def test_mixed_types_error(self) -> None: """Test mixed int8/uint8 inputs should raise RuntimeError""" @@ -138,33 +115,29 @@ def test_mixed_types_error(self) -> None: cast(PassResult, p(gm)).graph_module self.assertIn("Unsupported input types", str(context.exception)) - def test_int8_dispatch_quantized_relu(self) -> None: - """Test int8 input should dispatch to asym8s_asym8s variant for quantized_relu""" - x = torch.randint(-128, 127, (2, 3), dtype=torch.int8) - gm = single_op_builder( - placeholders=(x,), - op=exir_ops.edge.cadence.quantized_relu.per_tensor, - args=(x, 0, 0, 1, 0), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_relu.per_tensor), - 0, - ) - # Should be replaced with int8 specific variant - self.assertEqual( - count_node( - gm, + @expand( + [ + ( + "int8", + torch.int8, exir_ops.edge.cadence.quantized_relu_asym8s_asym8s.per_tensor, ), - 1, - ) - - def test_uint8_dispatch_quantized_relu(self) -> None: - """Test uint8 input should dispatch to asym8u_asym8u variant for quantized_relu""" - x = torch.randint(0, 255, (2, 3), dtype=torch.uint8) + ( + "uint8", + torch.uint8, + exir_ops.edge.cadence.quantized_relu_asym8u_asym8u.per_tensor, + ), + ] + ) + def test_dispatch_quantized_relu( + self, + _: str, + dtype: torch.dtype, + expected_op: torch._ops.OpOverload, + ) -> None: + """Test quantized_relu dispatches to correct dtype-specific variant""" + min_val, max_val = 
torch.iinfo(dtype).min, torch.iinfo(dtype).max + x = torch.randint(min_val, max_val, (2, 3), dtype=dtype) gm = single_op_builder( placeholders=(x,), op=exir_ops.edge.cadence.quantized_relu.per_tensor, @@ -177,45 +150,33 @@ def test_uint8_dispatch_quantized_relu(self) -> None: count_node(gm, exir_ops.edge.cadence.quantized_relu.per_tensor), 0, ) - # Should be replaced with uint8 specific variant - self.assertEqual( - count_node( - gm, - exir_ops.edge.cadence.quantized_relu_asym8u_asym8u.per_tensor, - ), - 1, - ) + # Should be replaced with dtype-specific variant + self.assertEqual(count_node(gm, expected_op), 1) - def test_int8_dispatch_quantized_matmul(self) -> None: - """Test int8 x int8 inputs should dispatch to asym8sxasym8s_asym8s variant for quantized_matmul""" - x = torch.randint(-128, 127, (2, 3), dtype=torch.int8) - y = torch.randint(-128, 127, (3, 4), dtype=torch.int8) - bias = torch.randint(-2147483648, 2147483647, (4,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, y, bias), - op=exir_ops.edge.cadence.quantized_matmul.default, - args=(x, 0, y, 0, bias, 1, 0, 0, False), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_matmul.default), - 0, - ) - # Should be replaced with int8 specific variant - self.assertEqual( - count_node( - gm, + @expand( + [ + ( + "int8", + torch.int8, exir_ops.edge.cadence.quantized_matmul_asym8sxasym8s_asym8s.default, ), - 1, - ) - - def test_uint8_dispatch_quantized_matmul(self) -> None: - """Test uint8 x uint8 inputs should dispatch to asym8uxasym8u_asym8u variant for quantized_matmul""" - x = torch.randint(0, 255, (2, 3), dtype=torch.uint8) - y = torch.randint(0, 255, (3, 4), dtype=torch.uint8) + ( + "uint8", + torch.uint8, + exir_ops.edge.cadence.quantized_matmul_asym8uxasym8u_asym8u.default, + ), + ] + ) + def test_dispatch_quantized_matmul( + self, + _: str, + dtype: torch.dtype, + expected_op: torch._ops.OpOverload, + ) -> None: + """Test quantized_matmul dispatches to correct dtype-specific variant""" + min_val, max_val = torch.iinfo(dtype).min, torch.iinfo(dtype).max + x = torch.randint(min_val, max_val, (2, 3), dtype=dtype) + y = torch.randint(min_val, max_val, (3, 4), dtype=dtype) bias = torch.randint(-2147483648, 2147483647, (4,), dtype=torch.int32) gm = single_op_builder( placeholders=(x, y, bias), @@ -229,252 +190,204 @@ def test_uint8_dispatch_quantized_matmul(self) -> None: count_node(gm, exir_ops.edge.cadence.quantized_matmul.default), 0, ) - # Should be replaced with uint8 specific variant - self.assertEqual( - count_node( - gm, - exir_ops.edge.cadence.quantized_matmul_asym8uxasym8u_asym8u.default, - ), - 1, - ) + # Should be replaced with dtype-specific variant + self.assertEqual(count_node(gm, expected_op), 1) - def test_int8_dispatch_quantized_conv_nchw(self) -> None: - """Test int8 x int8 inputs should dispatch to asym8sxasym8s_asym8s variant for quantized_conv_nchw""" - x = torch.randint(-128, 127, (1, 3, 8, 8), dtype=torch.int8) - w = torch.randint(-128, 127, (16, 3, 3, 3), dtype=torch.int8) - b = torch.randint(-2147483648, 2147483647, (16,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - args=(x, w, b, [1, 1], [0, 0], [1, 1], 1, 0, 0, 1.0, 1.0, 0, 1, 1), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - 
count_node(gm, exir_ops.edge.cadence.quantized_conv_nchw.per_tensor), - 0, - ) - # Should be replaced with int8 specific variant - self.assertEqual( - count_node( - gm, + @expand( + [ + ( + "int8_nchw", + torch.int8, + (1, 3, 8, 8), # x_shape + exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, exir_ops.edge.cadence.quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor, ), - 1, - ) - - def test_uint8_dispatch_quantized_conv_nchw(self) -> None: - """Test uint8 x uint8 inputs should dispatch to asym8uxasym8u_asym8u variant for quantized_conv_nchw""" - x = torch.randint(0, 255, (1, 3, 8, 8), dtype=torch.uint8) - w = torch.randint(0, 255, (16, 3, 3, 3), dtype=torch.uint8) - b = torch.randint(-2147483648, 2147483647, (16,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - args=(x, w, b, [1, 1], [0, 0], [1, 1], 1, 0, 0, 1.0, 1.0, 0, 1, 1), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nchw.per_tensor), - 0, - ) - # Should be replaced with uint8 specific variant - self.assertEqual( - count_node( - gm, + ( + "uint8_nchw", + torch.uint8, + (1, 3, 8, 8), # x_shape + exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, exir_ops.edge.cadence.quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor, ), - 1, - ) - - def test_int8_dispatch_quantized_conv_nhwc(self) -> None: - """Test int8 x int8 inputs should dispatch to asym8sxasym8s_asym8s variant for quantized_conv_nhwc""" - x = torch.randint(-128, 127, (1, 8, 8, 3), dtype=torch.int8) - w = torch.randint(-128, 127, (16, 3, 3, 3), dtype=torch.int8) - b = torch.randint(-2147483648, 2147483647, (16,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - args=(x, w, b, [1, 1], [0, 0], [1, 1], 1, 0, 0, 1.0, 1.0, 0, 1, 1), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor), - 0, - ) - # Should be replaced with int8 specific variant - self.assertEqual( - count_node( - gm, + ( + "int8_nhwc", + torch.int8, + (1, 8, 8, 3), # x_shape + exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, exir_ops.edge.cadence.quantized_conv_nhwc_asym8sxsym8s_asym8s.per_tensor, ), - 1, - ) - - def test_uint8_dispatch_quantized_conv_nhwc(self) -> None: - """Test uint8 x uint8 inputs should dispatch to asym8uxasym8u_asym8u variant for quantized_conv_nhwc""" - x = torch.randint(0, 255, (1, 8, 8, 3), dtype=torch.uint8) - w = torch.randint(0, 255, (16, 3, 3, 3), dtype=torch.uint8) - b = torch.randint(-2147483648, 2147483647, (16,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - args=(x, w, b, [1, 1], [0, 0], [1, 1], 1, 0, 0, 1.0, 1.0, 0, 1, 1), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor), - 0, - ) - # Should be replaced with uint8 specific variant - self.assertEqual( - count_node( - gm, + ( + "uint8_nhwc", + torch.uint8, + (1, 8, 8, 3), # x_shape + exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, exir_ops.edge.cadence.quantized_conv_nhwc_asym8uxsym8u_asym8u.per_tensor, ), - 1, - 
) - - def test_int8_dispatch_quantized_conv_nchw_dilated(self) -> None: - """Test int8 x int8 inputs with dilation should dispatch to dilated_asym8sxasym8s_asym8s variant for quantized_conv_nchw_dilated""" - x = torch.randint(-128, 127, (1, 3, 8, 8), dtype=torch.int8) - w = torch.randint(-128, 127, (16, 3, 3, 3), dtype=torch.int8) + ] + ) + def test_dispatch_quantized_conv_2d( + self, + _: str, + dtype: torch.dtype, + x_shape: tuple[int, ...], + original_op: torch._ops.OpOverload, + expected_op: torch._ops.OpOverload, + ) -> None: + """Test quantized_conv_2d (nchw/nhwc) dispatches to correct dtype-specific variant""" + min_val, max_val = torch.iinfo(dtype).min, torch.iinfo(dtype).max + x = torch.randint(min_val, max_val, x_shape, dtype=dtype) + w = torch.randint(min_val, max_val, (16, 3, 3, 3), dtype=dtype) b = torch.randint(-2147483648, 2147483647, (16,), dtype=torch.int32) gm = single_op_builder( placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - args=(x, w, b, [1, 1], [0, 0], [2, 2], 1, 0, 0, 1.0, 1.0, 0, 1, 1), + op=original_op, + args=(x, w, b, [1, 1], [0, 0], [1, 1], 1, 0, 0, 1.0, 1.0, 0, 1, 1), ) p = CompileTimeTypeDispatchPass() gm = cast(PassResult, p(gm)).graph_module # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nchw.per_tensor), - 0, - ) - # Should be replaced with int8 specific variant - self.assertEqual( - count_node( - gm, + self.assertEqual(count_node(gm, original_op), 0) + # Should be replaced with dtype-specific variant + self.assertEqual(count_node(gm, expected_op), 1) + + @expand( + [ + ( + "int8_nchw_dilated", + torch.int8, + (1, 3, 8, 8), # x_shape + exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, exir_ops.edge.cadence.quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor, ), - 1, - ) - - def test_uint8_dispatch_quantized_conv_nchw_dilated(self) -> None: - """Test uint8 x uint8 inputs with dilation should dispatch to dilated_asym8uxasym8u_asym8u variant for quantized_conv_nchw""" - x = torch.randint(0, 255, (1, 3, 8, 8), dtype=torch.uint8) - w = torch.randint(0, 255, (16, 3, 3, 3), dtype=torch.uint8) - b = torch.randint(-2147483648, 2147483647, (16,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - args=(x, w, b, [1, 1], [0, 0], [2, 2], 1, 0, 0, 1.0, 1.0, 0, 1, 1), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nchw.per_tensor), - 0, - ) - # Should be replaced with uint8 specific variant - self.assertEqual( - count_node( - gm, + ( + "uint8_nchw_dilated", + torch.uint8, + (1, 3, 8, 8), # x_shape + exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, exir_ops.edge.cadence.quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor, ), - 1, - ) - - def test_int8_dispatch_quantized_conv_nhwc_dilated(self) -> None: - """Test int8 x int8 inputs with dilation should dispatch to dilated_asym8sxasym8s_asym8s variant for quantized_conv_nhwc""" - x = torch.randint(-128, 127, (1, 8, 8, 3), dtype=torch.int8) - w = torch.randint(-128, 127, (16, 3, 3, 3), dtype=torch.int8) + ( + "int8_nhwc_dilated", + torch.int8, + (1, 8, 8, 3), # x_shape + exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor, + ), + ( + "uint8_nhwc_dilated", + torch.uint8, + (1, 8, 8, 3), # x_shape + 
exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor, + ), + ] + ) + def test_dispatch_quantized_conv_2d_dilated( + self, + _: str, + dtype: torch.dtype, + x_shape: tuple[int, ...], + original_op: torch._ops.OpOverload, + expected_op: torch._ops.OpOverload, + ) -> None: + """Test quantized_conv_2d with dilation dispatches to correct dtype-specific variant""" + min_val, max_val = torch.iinfo(dtype).min, torch.iinfo(dtype).max + x = torch.randint(min_val, max_val, x_shape, dtype=dtype) + w = torch.randint(min_val, max_val, (16, 3, 3, 3), dtype=dtype) b = torch.randint(-2147483648, 2147483647, (16,), dtype=torch.int32) gm = single_op_builder( placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, + op=original_op, args=(x, w, b, [1, 1], [0, 0], [2, 2], 1, 0, 0, 1.0, 1.0, 0, 1, 1), ) p = CompileTimeTypeDispatchPass() gm = cast(PassResult, p(gm)).graph_module # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor), - 0, - ) - # Should be replaced with int8 specific variant - self.assertEqual( - count_node( - gm, - exir_ops.edge.cadence.quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor, - ), - 1, - ) + self.assertEqual(count_node(gm, original_op), 0) + # Should be replaced with dtype-specific variant + self.assertEqual(count_node(gm, expected_op), 1) - def test_uint8_dispatch_quantized_conv_nhwc_dilated(self) -> None: - """Test uint8 x uint8 inputs with dilation should dispatch to dilated_asym8uxasym8u_asym8u variant for quantized_conv_nhwc""" - x = torch.randint(0, 255, (1, 8, 8, 3), dtype=torch.uint8) - w = torch.randint(0, 255, (16, 3, 3, 3), dtype=torch.uint8) + @expand( + [ + ( + "int8_nchw_1d", + torch.int8, + (1, 3, 8), # x_shape + exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor, + ), + ( + "uint8_nchw_1d", + torch.uint8, + (1, 3, 8), # x_shape + exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor, + ), + ( + "int8_nhwc_1d", + torch.int8, + (1, 8, 3), # x_shape + exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor, + ), + ( + "uint8_nhwc_1d", + torch.uint8, + (1, 8, 3), # x_shape + exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor, + ), + ] + ) + def test_dispatch_quantized_conv_1d( + self, + _: str, + dtype: torch.dtype, + x_shape: tuple[int, ...], + original_op: torch._ops.OpOverload, + expected_op: torch._ops.OpOverload, + ) -> None: + """Test quantized_conv_1d (nchw/nhwc) dispatches to correct dtype-specific variant""" + min_val, max_val = torch.iinfo(dtype).min, torch.iinfo(dtype).max + x = torch.randint(min_val, max_val, x_shape, dtype=dtype) + w = torch.randint(min_val, max_val, (16, 3, 3), dtype=dtype) b = torch.randint(-2147483648, 2147483647, (16,), dtype=torch.int32) gm = single_op_builder( placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - args=(x, w, b, [1, 1], [0, 0], [2, 2], 1, 0, 0, 1.0, 1.0, 0, 1, 1), + op=original_op, + args=(x, w, b, [1, 1], [0, 0], [1, 1], 1, 0, 0, 1.0, 1.0, 0, 1, 1), ) p = CompileTimeTypeDispatchPass() gm = cast(PassResult, p(gm)).graph_module # Original op should be replaced - self.assertEqual( - count_node(gm, 
exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor), - 0, - ) - # Should be replaced with uint8 specific variant - self.assertEqual( - count_node( - gm, - exir_ops.edge.cadence.quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor, - ), - 1, - ) + self.assertEqual(count_node(gm, original_op), 0) + # Should be replaced with dtype-specific variant + self.assertEqual(count_node(gm, expected_op), 1) - def test_int8_dispatch_quantized_add(self) -> None: - """Test int8 x int8 inputs should dispatch to asym8sxasym8s_asym8s variant for quantized_add""" - x = torch.randint(-128, 127, (2, 3), dtype=torch.int8) - y = torch.randint(-128, 127, (2, 3), dtype=torch.int8) - gm = single_op_builder( - placeholders=(x, y), - op=exir_ops.edge.cadence.quantized_add.per_tensor, - args=(x, 1.0, 0, y, 1.0, 0, 1.0, 0), - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_add.per_tensor), - 0, - ) - # Should be replaced with int8 specific variant - self.assertEqual( - count_node( - gm, + @expand( + [ + ( + "int8", + torch.int8, exir_ops.edge.cadence.quantized_add_asym8sxasym8s_asym8s.per_tensor, ), - 1, - ) - - def test_uint8_dispatch_quantized_add(self) -> None: - """Test uint8 x uint8 inputs should dispatch to asym8uxasym8u_asym8u variant for quantized_add""" - x = torch.randint(0, 255, (2, 3), dtype=torch.uint8) - y = torch.randint(0, 255, (2, 3), dtype=torch.uint8) + ( + "uint8", + torch.uint8, + exir_ops.edge.cadence.quantized_add_asym8uxasym8u_asym8u.per_tensor, + ), + ] + ) + def test_dispatch_quantized_add( + self, + _: str, + dtype: torch.dtype, + expected_op: torch._ops.OpOverload, + ) -> None: + """Test quantized_add dispatches to correct dtype-specific variant""" + min_val, max_val = torch.iinfo(dtype).min, torch.iinfo(dtype).max + x = torch.randint(min_val, max_val, (2, 3), dtype=dtype) + y = torch.randint(min_val, max_val, (2, 3), dtype=dtype) gm = single_op_builder( placeholders=(x, y), op=exir_ops.edge.cadence.quantized_add.per_tensor, @@ -487,158 +400,62 @@ def test_uint8_dispatch_quantized_add(self) -> None: count_node(gm, exir_ops.edge.cadence.quantized_add.per_tensor), 0, ) - # Should be replaced with uint8 specific variant - self.assertEqual( - count_node( - gm, - exir_ops.edge.cadence.quantized_add_asym8uxasym8u_asym8u.per_tensor, - ), - 1, - ) + # Should be replaced with dtype-specific variant + self.assertEqual(count_node(gm, expected_op), 1) - def test_int8_dispatch_quantized_conv_nchw_depthwise(self) -> None: - """Test int8 x int8 inputs with depthwise should dispatch to depthwise_asym8sxsym8s_asym8s variant for quantized_conv_nchw""" - # Depthwise convolution: groups == input_channels - x = torch.randint(-128, 127, (1, 3, 8, 8), dtype=torch.int8) - w = torch.randint( - -128, 127, (3, 1, 3, 3), dtype=torch.int8 - ) # groups=3, input_channels=3 - b = torch.randint(-2147483648, 2147483647, (3,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - args=( - x, - w, - b, - [1, 1], - [0, 0], - [1, 1], - 3, - 0, - 0, - 1.0, - 1.0, - 0, - 1, - 1, - ), # groups=3 - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nchw.per_tensor), - 0, - ) - # Should be replaced with int8 depthwise specific variant - self.assertEqual( - count_node( - gm, + 
@expand( + [ + ( + "int8_nchw_depthwise", + torch.int8, + (1, 3, 8, 8), # x_shape + (3, 1, 3, 3), # w_shape (groups=3, input_channels=3) + exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, exir_ops.edge.cadence.quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor, ), - 1, - ) - - def test_uint8_dispatch_quantized_conv_nchw_depthwise(self) -> None: - """Test uint8 x uint8 inputs with depthwise should dispatch to depthwise_asym8uxasym8u_asym8u variant for quantized_conv_nchw""" - # Depthwise convolution: groups == input_channels - x = torch.randint(0, 255, (1, 3, 8, 8), dtype=torch.uint8) - w = torch.randint( - 0, 255, (3, 1, 3, 3), dtype=torch.uint8 - ) # groups=3, input_channels=3 - b = torch.randint(-2147483648, 2147483647, (3,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - args=( - x, - w, - b, - [1, 1], - [0, 0], - [1, 1], - 3, - 0, - 0, - 1.0, - 1.0, - 0, - 1, - 1, - ), # groups=3 - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nchw.per_tensor), - 0, - ) - # Should be replaced with uint8 depthwise specific variant - self.assertEqual( - count_node( - gm, + ( + "uint8_nchw_depthwise", + torch.uint8, + (1, 3, 8, 8), # x_shape + (3, 1, 3, 3), # w_shape (groups=3, input_channels=3) + exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, exir_ops.edge.cadence.quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor, ), - 1, - ) - - def test_int8_dispatch_quantized_conv_nhwc_depthwise(self) -> None: - """Test int8 x int8 inputs with depthwise should dispatch to depthwise_asym8sxsym8s_asym8s variant for quantized_conv_nhwc""" - # Depthwise convolution: groups == input_channels - x = torch.randint(-128, 127, (1, 8, 8, 3), dtype=torch.int8) - w = torch.randint( - -128, 127, (3, 3, 3, 1), dtype=torch.int8 - ) # groups=3, input_channels=3 - b = torch.randint(-2147483648, 2147483647, (3,), dtype=torch.int32) - gm = single_op_builder( - placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - args=( - x, - w, - b, - [1, 1], - [0, 0], - [1, 1], - 3, - 0, - 0, - 1.0, - 1.0, - 0, - 1, - 1, - ), # groups=3 - ) - p = CompileTimeTypeDispatchPass() - gm = cast(PassResult, p(gm)).graph_module - # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor), - 0, - ) - # Should be replaced with int8 depthwise specific variant - self.assertEqual( - count_node( - gm, + ( + "int8_nhwc_depthwise", + torch.int8, + (1, 8, 8, 3), # x_shape + (3, 3, 3, 1), # w_shape (groups=3, input_channels=3) + exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, exir_ops.edge.cadence.quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor, ), - 1, - ) - - def test_uint8_dispatch_quantized_conv_nhwc_depthwise(self) -> None: - """Test uint8 x uint8 inputs with depthwise should dispatch to depthwise_asym8uxasym8u_asym8u variant for quantized_conv_nhwc""" - # Depthwise convolution: groups == input_channels - x = torch.randint(0, 255, (1, 8, 8, 3), dtype=torch.uint8) - w = torch.randint( - 0, 255, (3, 3, 3, 1), dtype=torch.uint8 - ) # groups=3, input_channels=3 + ( + "uint8_nhwc_depthwise", + torch.uint8, + (1, 8, 8, 3), # x_shape + (3, 3, 3, 1), # w_shape (groups=3, input_channels=3) + exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, + 
exir_ops.edge.cadence.quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor, + ), + ] + ) + def test_dispatch_quantized_conv_depthwise( + self, + _: str, + dtype: torch.dtype, + x_shape: tuple[int, ...], + w_shape: tuple[int, ...], + original_op: torch._ops.OpOverload, + expected_op: torch._ops.OpOverload, + ) -> None: + """Test quantized_conv depthwise (groups == input_channels) dispatches to correct dtype-specific variant""" + min_val, max_val = torch.iinfo(dtype).min, torch.iinfo(dtype).max + x = torch.randint(min_val, max_val, x_shape, dtype=dtype) + w = torch.randint(min_val, max_val, w_shape, dtype=dtype) b = torch.randint(-2147483648, 2147483647, (3,), dtype=torch.int32) gm = single_op_builder( placeholders=(x, w, b), - op=exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, + op=original_op, args=( x, w, @@ -654,20 +471,11 @@ def test_uint8_dispatch_quantized_conv_nhwc_depthwise(self) -> None: 0, 1, 1, - ), # groups=3 + ), ) p = CompileTimeTypeDispatchPass() gm = cast(PassResult, p(gm)).graph_module # Original op should be replaced - self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor), - 0, - ) - # Should be replaced with uint8 depthwise specific variant - self.assertEqual( - count_node( - gm, - exir_ops.edge.cadence.quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor, - ), - 1, - ) + self.assertEqual(count_node(gm, original_op), 0) + # Should be replaced with dtype-specific variant + self.assertEqual(count_node(gm, expected_op), 1) diff --git a/backends/cadence/aot/type_dispatch.py b/backends/cadence/aot/type_dispatch.py index 108c4fb1a92..958a78a4808 100644 --- a/backends/cadence/aot/type_dispatch.py +++ b/backends/cadence/aot/type_dispatch.py @@ -129,6 +129,8 @@ def call_operator( type_suffix = config.type_dispatch_suffixes[dtype_key] base_name = config.base_name + typed_op_name = f"{base_name}_{type_suffix}" + if op in [ exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, @@ -140,17 +142,18 @@ def call_operator( else args[0].to_tensor().shape[-1] ) is_depthwise = groups == input_channels - - dilation = args[5] # pyre-ignore[16]: None has no attribute '__iter__'. 
- is_dilated = any(d > 1 for d in dilation) - - if is_dilated: - type_suffix = f"dilated_{type_suffix}" - elif is_depthwise: - type_suffix = f"depthwise_{type_suffix}" - - typed_op_name = f"{base_name}_{type_suffix}" + is_dilated = any(d > 1 for d in args[5]) + is_1d = len(args[0].to_tensor().shape) == 3 + + if is_depthwise: + typed_op_name = f"{base_name}_depthwise_{type_suffix}" + elif is_dilated: + typed_op_name = f"{base_name}_dilated_{type_suffix}" + elif is_1d and groups == 1: + typed_op_name = ( + f"quantized_conv1d_{base_name.split('_')[-1]}_{type_suffix}" + ) typed_op = getattr( getattr(exir_ops.edge.cadence, typed_op_name), config.variant diff --git a/backends/cadence/hifi/kernels/kernels.cpp b/backends/cadence/hifi/kernels/kernels.cpp index feabe6e1828..d9b60ab29cf 100644 --- a/backends/cadence/hifi/kernels/kernels.cpp +++ b/backends/cadence/hifi/kernels/kernels.cpp @@ -23,17 +23,9 @@ memcpy(void* dst, const void* src, size_t num_bytes) { void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size) { constexpr size_t kAlignment = 16; // 16-byte alignment for vectorized operations - ET_LOG( - Info, - "Attempting to allocate %zu bytes of temp memory (16-byte aligned)", - size); Result temp_mem_res = ctx.allocate_temp(size, kAlignment); if (temp_mem_res.ok()) { void* ptr = temp_mem_res.get(); - ET_LOG( - Info, - "Successfully allocated temp memory at %p (16-byte aligned)", - ptr); return ptr; } else { ET_LOG( diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp new file mode 100644 index 00000000000..c1b5a1836a3 --- /dev/null +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -0,0 +1,189 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) + +using Tensor = executorch::aten::Tensor; +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +using ScalarType = executorch::aten::ScalarType; +using ::executorch::aten::IntArrayRef; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +// Optimized NCHW 1D convolution for int8 x int8 -> int8 +void xa_opt_quantized_conv1d_nchw_asym8sxsym8s_asym8s( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + constexpr int kNnlibMaxDim = 3; + + WORD8* __restrict__ p_out = + (WORD8* __restrict__)out.mutable_data_ptr(); + WORD8* __restrict__ p_inp = + (WORD8* __restrict__)input.const_data_ptr(); + WORD8* __restrict__ p_kernel = + (WORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(1); + WORD32 input_width = input.size(2); + WORD32 out_channels = weight.size(0); + WORD32 kernel_channels = weight.size(1); + WORD32 kernel_width = weight.size(2); + WORD32 out_width = out.size(2); + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32[out_channels]; + WORD32 out_shift32[out_channels]; + WORD32 kernel_zero_bias = -weight_zero_point; + float out_scale = 1. / output_scale; + + for (int i = 0; i < out_channels; i++) { + out_multiplier32[i] = bias_scale * out_scale * 2147483648; + out_shift32[i] = 0; + } + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 1; + WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( + ctx, ((batches * input_channels * input_width) + 8) * sizeof(WORD8)); + WORD8* ptr2 = (WORD8*)kernels::allocate_temp_memory( + ctx, + ((out_channels * kernel_channels * kernel_width) + 8) * sizeof(WORD8)); + WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); + WORD8* pkernel = (WORD8*)ALIGN_PTR(ptr2, 8); + + WORD32 p_inp_shape[kNnlibMaxDim]; + p_inp_shape[0] = batches; + p_inp_shape[1] = input_channels; + p_inp_shape[2] = input_width; + + WORD32 p_out_shape[kNnlibMaxDim]; + p_out_shape[0] = batches; + p_out_shape[1] = input_width; + p_out_shape[2] = input_channels; + + WORD32 p_permute_vec[kNnlibMaxDim] = {0, 2, 1}; + + xa_nn_transpose_8_8( + pin, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + WORD32 p_inp_shape1[kNnlibMaxDim]; + p_inp_shape1[0] = out_channels; + p_inp_shape1[1] = kernel_channels; + p_inp_shape1[2] = kernel_width; + + WORD32 p_out_shape1[kNnlibMaxDim]; + p_out_shape1[0] = out_channels; + p_out_shape1[1] = kernel_width; + p_out_shape1[2] = kernel_channels; + + xa_nn_transpose_8_8( + pkernel, + p_out_shape1, + p_kernel, + p_inp_shape1, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ? 
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + WORD8* in_batch = pin + _n * input_channels * input_width; + WORD8* out_batch = p_out + _n * out_channels * out_width; + + xa_nn_conv1d_std_asym8xasym8( + out_batch, + in_batch, + pkernel, + p_bias, + 1, + input_width, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +void quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + __ET_UNUSED IntArrayRef dilation, + __ET_UNUSED int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + xa_opt_quantized_conv1d_nchw_asym8sxsym8s_asym8s( + ctx, + input, + weight, + bias, + stride, + padding, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp new file mode 100644 index 00000000000..fae49ec97c7 --- /dev/null +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -0,0 +1,189 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
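Each of the four new conv1d kernels folds the float requantization scale into the per-channel multiplier array as a Q31 value with a fixed shift of 0: out_multiplier32[i] = bias_scale * (1 / output_scale) * 2147483648, where 2147483648 is 2^31. A minimal sketch of that conversion, assuming (as the zero shift implies) that the effective scale bias_scale / output_scale stays below 1.0:

    def to_q31_multiplier(bias_scale: float, output_scale: float) -> int:
        # Mirrors out_multiplier32[i] = bias_scale * (1.0 / output_scale) * 2**31.
        effective_scale = bias_scale / output_scale
        assert 0.0 <= effective_scale < 1.0, "a shift of 0 assumes scale < 1.0"
        return int(effective_scale * (1 << 31))

    # Illustrative values only: bias_scale 0.0005 and output_scale 0.05
    # give an effective scale of 0.01 and a multiplier of about 21_474_836.
    print(to_q31_multiplier(0.0005, 0.05))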
+ */ + +#include +#include +#include + +#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) + +using Tensor = executorch::aten::Tensor; +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +using ScalarType = executorch::aten::ScalarType; +using ::executorch::aten::IntArrayRef; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +// Optimized NCHW 1D convolution for uint8 x uint8 -> uint8 +void xa_opt_quantized_conv1d_nchw_asym8uxsym8u_asym8u( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + constexpr int kNnlibMaxDim = 3; + + UWORD8* __restrict__ p_out = + (UWORD8* __restrict__)out.mutable_data_ptr(); + UWORD8* __restrict__ p_inp = + (UWORD8* __restrict__)input.const_data_ptr(); + UWORD8* __restrict__ p_kernel = + (UWORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(1); + WORD32 input_width = input.size(2); + WORD32 out_channels = weight.size(0); + WORD32 kernel_channels = weight.size(1); + WORD32 kernel_width = weight.size(2); + WORD32 out_width = out.size(2); + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32[out_channels]; + WORD32 out_shift32[out_channels]; + WORD32 kernel_zero_bias = -weight_zero_point; + float out_scale = 1. / output_scale; + + for (int i = 0; i < out_channels; i++) { + out_multiplier32[i] = bias_scale * out_scale * 2147483648; + out_shift32[i] = 0; + } + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 1; + UWORD8* ptr1 = (UWORD8*)kernels::allocate_temp_memory( + ctx, ((batches * input_channels * input_width) + 8) * sizeof(UWORD8)); + UWORD8* ptr2 = (UWORD8*)kernels::allocate_temp_memory( + ctx, + ((out_channels * kernel_channels * kernel_width) + 8) * sizeof(UWORD8)); + UWORD8* pin = (UWORD8*)ALIGN_PTR(ptr1, 8); + UWORD8* pkernel = (UWORD8*)ALIGN_PTR(ptr2, 8); + + WORD32 p_inp_shape[kNnlibMaxDim]; + p_inp_shape[0] = batches; + p_inp_shape[1] = input_channels; + p_inp_shape[2] = input_width; + + WORD32 p_out_shape[kNnlibMaxDim]; + p_out_shape[0] = batches; + p_out_shape[1] = input_width; + p_out_shape[2] = input_channels; + + WORD32 p_permute_vec[kNnlibMaxDim] = {0, 2, 1}; + + xa_nn_transpose_8_8( + pin, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + WORD32 p_inp_shape1[kNnlibMaxDim]; + p_inp_shape1[0] = out_channels; + p_inp_shape1[1] = kernel_channels; + p_inp_shape1[2] = kernel_width; + + WORD32 p_out_shape1[kNnlibMaxDim]; + p_out_shape1[0] = out_channels; + p_out_shape1[1] = kernel_width; + p_out_shape1[2] = kernel_channels; + + xa_nn_transpose_8_8( + pkernel, + p_out_shape1, + p_kernel, + p_inp_shape1, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ? 
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + UWORD8* in_batch = pin + _n * input_channels * input_width; + UWORD8* out_batch = p_out + _n * out_channels * out_width; + + xa_nn_conv1d_std_asym8uxasym8u( + out_batch, + in_batch, + pkernel, + p_bias, + 1, + input_width, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +void quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + __ET_UNUSED IntArrayRef dilation, + __ET_UNUSED int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + xa_opt_quantized_conv1d_nchw_asym8uxsym8u_asym8u( + ctx, + input, + weight, + bias, + stride, + padding, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp new file mode 100644 index 00000000000..a2cb591b3a7 --- /dev/null +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) + +using Tensor = executorch::aten::Tensor; +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +using ScalarType = executorch::aten::ScalarType; +using ::executorch::aten::IntArrayRef; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +// Optimized NHWC 1D convolution for int8 x int8 -> int8 +void xa_opt_quantized_conv1d_nhwc_asym8sxsym8s_asym8s( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + WORD8* __restrict__ p_out = + (WORD8* __restrict__)out.mutable_data_ptr(); + WORD8* __restrict__ p_inp = + (WORD8* __restrict__)input.const_data_ptr(); + WORD8* __restrict__ p_kernel = + (WORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(1); + WORD32 input_width = input.size(2); + WORD32 out_channels = weight.size(0); + WORD32 kernel_width = weight.size(2); + WORD32 out_width = out.size(2); + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32[out_channels]; + WORD32 out_shift32[out_channels]; + WORD32 kernel_zero_bias = -weight_zero_point; + float out_scale = 1. / output_scale; + + for (int i = 0; i < out_channels; i++) { + out_multiplier32[i] = bias_scale * out_scale * 2147483648; + out_shift32[i] = 0; + } + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 0; + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ? 
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + WORD8* in_batch = p_inp + _n * input_channels * input_width; + WORD8* out_batch = p_out + _n * out_channels * out_width; + + xa_nn_conv1d_std_asym8xasym8( + out_batch, + in_batch, + p_kernel, + p_bias, + 1, + input_width, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +void quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + __ET_UNUSED IntArrayRef dilation, + __ET_UNUSED int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + xa_opt_quantized_conv1d_nhwc_asym8sxsym8s_asym8s( + ctx, + input, + weight, + bias, + stride, + padding, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp new file mode 100644 index 00000000000..441952ca189 --- /dev/null +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) + +using Tensor = executorch::aten::Tensor; +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +using ScalarType = executorch::aten::ScalarType; +using ::executorch::aten::IntArrayRef; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +// Optimized NHWC 1D convolution for uint8 x uint8 -> uint8 +void xa_opt_quantized_conv1d_nhwc_asym8uxsym8u_asym8u( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + UWORD8* __restrict__ p_out = + (UWORD8* __restrict__)out.mutable_data_ptr(); + UWORD8* __restrict__ p_inp = + (UWORD8* __restrict__)input.const_data_ptr(); + UWORD8* __restrict__ p_kernel = + (UWORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(1); + WORD32 input_width = input.size(2); + WORD32 out_channels = weight.size(0); + WORD32 kernel_width = weight.size(2); + WORD32 out_width = out.size(2); + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32[out_channels]; + WORD32 out_shift32[out_channels]; + WORD32 kernel_zero_bias = -weight_zero_point; + float out_scale = 1. / output_scale; + + for (int i = 0; i < out_channels; i++) { + out_multiplier32[i] = bias_scale * out_scale * 2147483648; + out_shift32[i] = 0; + } + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 0; + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ? 
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + UWORD8* in_batch = p_inp + _n * input_channels * input_width; + UWORD8* out_batch = p_out + _n * out_channels * out_width; + + xa_nn_conv1d_std_asym8uxasym8u( + out_batch, + in_batch, + p_kernel, + p_bias, + 1, + input_width, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +void quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + __ET_UNUSED IntArrayRef dilation, + __ET_UNUSED int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + xa_opt_quantized_conv1d_nhwc_asym8uxsym8u_asym8u( + ctx, + input, + weight, + bias, + stride, + padding, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl index d310396c262..fa263d4017c 100644 --- a/backends/cadence/hifi/operators/targets.bzl +++ b/backends/cadence/hifi/operators/targets.bzl @@ -66,6 +66,8 @@ OPERATORS = [ "quantized_conv_nchw_out", "quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out", + "quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out", "quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out", "quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out", @@ -73,6 +75,8 @@ OPERATORS = [ "quantized_conv_nhwc_out", "quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out", + "quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out", "quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out", "quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out", diff --git a/backends/cadence/reference/operators/quantized_conv_nchw_out.cpp b/backends/cadence/reference/operators/quantized_conv_nchw_out.cpp index aefa75d7047..1a4faeed250 100644 --- a/backends/cadence/reference/operators/quantized_conv_nchw_out.cpp +++ b/backends/cadence/reference/operators/quantized_conv_nchw_out.cpp @@ -496,6 +496,72 @@ void quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out( out); } +void quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t 
out_shift, + Tensor& out) { + quantized_conv_nchw( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +void quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv_nchw( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + } // namespace native } // namespace reference } // namespace impl diff --git a/backends/cadence/reference/operators/quantized_conv_nhwc_out.cpp b/backends/cadence/reference/operators/quantized_conv_nhwc_out.cpp index 26fbc86d5b0..21b17fb0724 100644 --- a/backends/cadence/reference/operators/quantized_conv_nhwc_out.cpp +++ b/backends/cadence/reference/operators/quantized_conv_nhwc_out.cpp @@ -417,6 +417,72 @@ void quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out( out); } +void quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv_nhwc( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +void quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv_nhwc( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + void quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, diff --git a/backends/mediatek/runtime/include/api/NeuronAdapter.h b/backends/mediatek/runtime/include/api/NeuronAdapter.h deleted file mode 100644 index 3a4af8299b0..00000000000 --- a/backends/mediatek/runtime/include/api/NeuronAdapter.h +++ /dev/null @@ -1,2385 +0,0 @@ -/* Copyright Statement: - * - * This software/firmware and related documentation ("MediaTek Software") are - * protected under relevant copyright laws. The information contained herein - * is confidential and proprietary to MediaTek Inc. and/or its licensors. - * Without the prior written permission of MediaTek inc. 
and/or its licensors, - * any reproduction, modification, use or disclosure of MediaTek Software, - * and information contained herein, in whole or in part, shall be strictly - * prohibited. - */ -/* MediaTek Inc. (C) 2020. All rights reserved. - * - * BY OPENING THIS FILE, RECEIVER HEREBY UNEQUIVOCALLY ACKNOWLEDGES AND AGREES - * THAT THE SOFTWARE/FIRMWARE AND ITS DOCUMENTATIONS ("MEDIATEK SOFTWARE") - * RECEIVED FROM MEDIATEK AND/OR ITS REPRESENTATIVES ARE PROVIDED TO RECEIVER ON - * AN "AS-IS" BASIS ONLY. MEDIATEK EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NONINFRINGEMENT. - * NEITHER DOES MEDIATEK PROVIDE ANY WARRANTY WHATSOEVER WITH RESPECT TO THE - * SOFTWARE OF ANY THIRD PARTY WHICH MAY BE USED BY, INCORPORATED IN, OR - * SUPPLIED WITH THE MEDIATEK SOFTWARE, AND RECEIVER AGREES TO LOOK ONLY TO SUCH - * THIRD PARTY FOR ANY WARRANTY CLAIM RELATING THERETO. RECEIVER EXPRESSLY - * ACKNOWLEDGES THAT IT IS RECEIVER'S SOLE RESPONSIBILITY TO OBTAIN FROM ANY - * THIRD PARTY ALL PROPER LICENSES CONTAINED IN MEDIATEK SOFTWARE. MEDIATEK - * SHALL ALSO NOT BE RESPONSIBLE FOR ANY MEDIATEK SOFTWARE RELEASES MADE TO - * RECEIVER'S SPECIFICATION OR TO CONFORM TO A PARTICULAR STANDARD OR OPEN - * FORUM. RECEIVER'S SOLE AND EXCLUSIVE REMEDY AND MEDIATEK'S ENTIRE AND - * CUMULATIVE LIABILITY WITH RESPECT TO THE MEDIATEK SOFTWARE RELEASED HEREUNDER - * WILL BE, AT MEDIATEK'S OPTION, TO REVISE OR REPLACE THE MEDIATEK SOFTWARE AT - * ISSUE, OR REFUND ANY SOFTWARE LICENSE FEES OR SERVICE CHARGE PAID BY RECEIVER - * TO MEDIATEK FOR SUCH MEDIATEK SOFTWARE AT ISSUE. - * - * The following software/firmware and/or related documentation ("MediaTek - * Software") have been modified by MediaTek Inc. All revisions are subject to - * any receiver's applicable license agreements with MediaTek Inc. - */ - -/** - * @file NeuronAdapter.h - */ - -#pragma once - -#ifdef __ANDROID__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wnullability-extension" -#include -#pragma clang diagnostic pop -#endif - -#include -#include -#include - -__BEGIN_DECLS - -/** - * NeuronModel is an opaque type that contains a description of the mathematical - * operations that constitute the model. - */ -typedef struct NeuronModel NeuronModel; - -/** - * NeuronCompilation is an opaque type that can be used to compile a machine - * learning model. - */ -typedef struct NeuronCompilation NeuronCompilation; - -/** - * NeuronExecution is an opaque type that can be used to apply a machine - * learning model to a set of inputs. - */ -typedef struct NeuronExecution NeuronExecution; - -/** - * NeuronDevice is an opaque type that represents a device. - * - * This type is used to query basic properties and supported operations of the - * corresponding device, and control which device(s) a model is to be run on. - * - * Available since 4.1.0 - */ -typedef struct NeuronDevice NeuronDevice; - -/** - * This type is used to represent shared memory, memory mapped files, and - * similar memories. - * - * It is the application's responsibility to ensure that there are no uses of - * the memory after calling NeuronMemory_free. This includes the execution which - * references this memory because of a call to - * NeuronExecution_setInputFromMemory or NeuronExecution_setOutputFromMemory. 
- * - * Available since 4.1.0 - */ -typedef struct NeuronMemory NeuronMemory; - -/** - * NeuronEvent is an opaque type that represents an event - * that will be signaled once an execution completes. - * - * Available since 5.0.0 - */ -typedef struct NeuronEvent NeuronEvent; - -/** - * Result codes. - */ -typedef enum { - NEURON_NO_ERROR = 0, - NEURON_OUT_OF_MEMORY = 1, - NEURON_INCOMPLETE = 2, - NEURON_UNEXPECTED_NULL = 3, - NEURON_BAD_DATA = 4, - NEURON_OP_FAILED = 5, - NEURON_UNMAPPABLE = 6, - NEURON_BAD_STATE = 7, - NEURON_BAD_VERSION = 8, - - // Available since 5.0.0 - NEURON_OUTPUT_INSUFFICIENT_SIZE = 9, - NEURON_UNAVAILABLE_DEVICE = 10, - NEURON_MISSED_DEADLINE_TRANSIENT = 11, - NEURON_MISSED_DEADLINE_PERSISTENT = 12, - NEURON_RESOURCE_EXHAUSTED_TRANSIENT = 13, - NEURON_RESOURCE_EXHAUSTED_PERSISTENT = 14, - NEURON_DEAD_OBJECT = 15, -} NeuronAdapterResultCode; - -/** - * Operand values with size in bytes that are smaller or equal to this will be - * immediately copied into the model. - */ -enum { NEURON_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES = 128 }; - -/** - * Size of the cache token, in bytes, required from the application. - */ -enum { NEURON_BYTE_SIZE_OF_CACHE_TOKEN = 32 }; - -/** - * Operand types. - * The type of operands that can be added to a model. - * - * Some notes on quantized tensors - * - *

NEURON_TENSOR_QUANT8_ASYMM - *

Attached to this tensor are two numbers that can be used to convert the 8 - * bit integer to the real value and vice versa. These two numbers are: - * - scale: a 32 bit floating point value greater than zero. - * - zeroPoint: a 32 bit integer, in range [0, 255]. - *

The formula is: real_value = (integer_value - zeroPoint) * scale. - * - *

NEURON_TENSOR_QUANT16_SYMM - *

Attached to this tensor is a number representing real value scale that is - * used to convert the 16 bit number to a real value in the following way: - * realValue = integerValue * scale. scale is a 32 bit floating point with value - * greater than zero. - * - *

NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL - *

This tensor is associated with additional fields that can be used to - * convert the 8 bit signed integer to the real value and vice versa. These - * fields are: - * - channelDim: a 32 bit unsigned integer indicating channel dimension. - * - scales: an array of positive 32 bit floating point values. - *

The size of the scales array must be equal to dimensions[channelDim]. - * NeuronModel_setOperandSymmPerChannelQuantParams must be used to set the - * parameters for an Operand of this type. The channel dimension of this tensor - * must not be unknown (dimensions[channelDim] != 0). The formula is: - * realValue[..., C, ...] = integerValue[..., C, ...] * scales[C] where C is an - * index in the Channel dimension. - * - *

NEURON_TENSOR_QUANT16_ASYMM - *

Attached to this tensor are two numbers that can be used to convert the 16 - * bit integer to the real value and vice versa. These two numbers are: - * - scale: a 32 bit floating point value greater than zero. - * - zeroPoint: a 32 bit integer, in range [0, 65535]. - *

The formula is: real_value = (integer_value - zeroPoint) * scale. - * - *

NEURON_TENSOR_QUANT8_SYMM - *

Attached to this tensor is a number representing real value scale that is - * used to convert the 8 bit number to a real value in the following way: - * realValue = integerValue * scale. scale is a 32 bit floating point with value - * greater than zero. - * - *

NEURON_TENSOR_QUANT8_ASYMM_SIGNED - *

Attached to this tensor are two numbers that can be used to convert the 8 - * bit integer to the real value and vice versa. These two numbers are: - * - scale: a 32 bit floating point value greater than zero. - * - zeroPoint: a 32 bit integer, in range [-128, 127]. - *

The formula is: real_value = (integer_value - zeroPoint) * scale. - */ -enum { - /** A 32 bit floating point scalar value. */ - NEURON_FLOAT32 = 0, - /** A signed 32 bit integer scalar value. */ - NEURON_INT32 = 1, - /** An unsigned 32 bit integer scalar value. */ - NEURON_UINT32 = 2, - /** A tensor of 32 bit floating point values. */ - NEURON_TENSOR_FLOAT32 = 3, - /** A tensor of 32 bit integer values. */ - NEURON_TENSOR_INT32 = 4, - /** A tensor of 8 bit integers that represent real numbers. */ - NEURON_TENSOR_QUANT8_ASYMM = 5, - /** An 8 bit boolean scalar value. */ - NEURON_BOOL = 6, - /** A tensor of 16 bit signed integers that represent real numbers. */ - NEURON_TENSOR_QUANT16_SYMM = 7, - /** A tensor of IEEE 754 16 bit floating point values. */ - NEURON_TENSOR_FLOAT16 = 8, - /** A tensor of 8 bit boolean values. */ - NEURON_TENSOR_BOOL8 = 9, - /** An IEEE 754 16 bit floating point scalar value. */ - NEURON_FLOAT16 = 10, - /** A tensor of 8 bit signed integers that represent real numbers. */ - NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL = 11, - /** A tensor of 16 bit unsigned integers that represent real numbers. */ - NEURON_TENSOR_QUANT16_ASYMM = 12, - /** A tensor of 8 bit signed integers that represent real numbers. */ - NEURON_TENSOR_QUANT8_SYMM = 13, - /** A tensor of 8 bit signed integers that represent real numbers. */ - NEURON_TENSOR_QUANT8_ASYMM_SIGNED = 14, - /** A reference to a model. */ - NEURON_MODEL = 15, - /** Extended data type - tensor uint32 */ - NEURON_EXT_TENSOR_UINT32 = 9001, - /** Extended data type -A tensor of 8 bit unsigned integers that represent - real numbers. */ - NEURON_EXT_TENSOR_QUANT8_ASYMM_PER_CHANNEL = 9002, - /** Extended data type -A tensor of 4 bit unsigned integers that represent - real numbers. */ - NEURON_EXT_TENSOR_QUANT4_ASYMM = 9003, - /** Extended data type -A tensor of 4 bit signed integers that represent real - numbers. */ - NEURON_EXT_TENSOR_QUANT4_ASYMM_SIGNED = 9004, - /** Extended data type -A tensor of 4 bit signed integers that represent real - numbers. */ - NEURON_EXT_TENSOR_QUANT4_SYMM = 9005, - /** Extended data type -A tensor of 16 bit signed integers that represent real - numbers. */ - NEURON_EXT_TENSOR_QUANT16_ASYMM_SIGNED = 9006, - /** Extended data type -A raw tensor. */ - NEURON_EXT_TENSOR_RAW = 9007, - /** Extended data type -A tensor of 8 bit signed integers that represent real - numbers. */ - NEURON_EXT_TENSOR_QUANT8_ASYMM_SIGNED_PER_CHANNEL = 9008, -}; - -/** - * NeuronOperandType describes the type of an operand. - * This structure is used to describe both scalars and tensors. - */ -typedef struct NeuronOperandType { - /** The data type, e.g NEURON_INT8. */ - int32_t type; - /** The number of dimensions. It should be 0 for scalars. */ - uint32_t dimensionCount; - /** The dimensions of the tensor. It should be nullptr for scalars. */ - const uint32_t* dimensions; - /** - * These two fields are only used for quantized tensors. - * They should be zero for scalars and non-fixed point tensors. - * The dequantized value of each entry is (value - zeroPoint) * scale. - */ - float scale; - /** Only used with scale for quantized tensors */ - int32_t zeroPoint; -} NeuronOperandType; - -/** - * Parameters for NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL operand. - */ -typedef struct NeuronSymmPerChannelQuantParams { - /** The index of the channel dimension. */ - uint32_t channelDim; - /** The size of the scale array. Should be equal to dimension[channelDim] of - * the Operand. 
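The asymmetric schemes described above all share the mapping real_value = (integer_value - zeroPoint) * scale. A small worked example for the QUANT8_ASYMM case, with illustrative parameters that are not taken from the header:

    scale, zero_point = 0.5, 10  # illustrative values only

    def dequantize(q: int) -> float:
        return (q - zero_point) * scale  # real = (integer - zeroPoint) * scale

    def quantize(x: float) -> int:
        q = round(x / scale) + zero_point
        return max(0, min(255, q))  # clamp to the QUANT8_ASYMM range [0, 255]

    assert dequantize(quantize(2.0)) == 2.0            # 2.0 -> 14 -> (14 - 10) * 0.5
    print(quantize(-7.3), dequantize(quantize(-7.3)))  # clamps to 0, dequantizes to -5.0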
*/ - uint32_t scaleCount; - /** The array of scaling values for each channel. Each value must be greater - * than zero. */ - const float* scales; -} NeuronSymmPerChannelQuantParams; - -/** - * Parameters for NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL and - * NEURON_TENSOR_QUANT8_ASYMM_PER_CHANNEL operand. - */ -typedef struct NeuronPerChannelQuantParams { - /** The index of the channel dimension. */ - uint32_t channelDim; - /** The size of the scale array. Should be equal to dimension[channelDim] of - * the Operand. */ - uint32_t scaleCount; - /** The array of scaling values for each channel. Each value must be greater - * than zero. */ - const float* scales; - /** The size of the zeroPoints. Should be equal to dimension[channelDim] of - * the Operand. */ - uint32_t zeroPointCount; - /** The array of zero point values for each channel. */ - const int32_t* zeroPoints; -} NeuronPerChannelQuantParams; - -/** - * Operation Types - * - * Supported operations are listed with available versions. See - * Neuron_getVersion for querying version number. - * - * Attempting to compile models with operations marked as not available - * will get a compilation failure. - * - * Refer to the operation support status of each hardware platform. - * Attempting to compile models with operations supported by this library but - * not supported by the underlying hardware platform will get a compilation - * failure too. - * - * Compatible NNAPI levels are also listed. - */ -typedef enum { - NEURON_ADD = 0, ///< Available since 4.1.0. NNAPI level 30. - NEURON_AVERAGE_POOL_2D = 1, ///< Available since 4.1.0. NNAPI level 30. - NEURON_CONCATENATION = 2, ///< Available since 4.1.0. NNAPI level 30. - NEURON_CONV_2D = 3, ///< Available since 4.1.0. NNAPI level 30. - NEURON_DEPTHWISE_CONV_2D = 4, ///< Available since 4.1.0. NNAPI level 30. - NEURON_DEPTH_TO_SPACE = 5, ///< Available since 4.1.0. NNAPI level 30. - NEURON_DEQUANTIZE = 6, ///< Available since 4.1.0. NNAPI level 30. - NEURON_EMBEDDING_LOOKUP = 7, ///< Not available. - NEURON_FLOOR = 8, ///< Available since 4.1.0. NNAPI level 30. - NEURON_FULLY_CONNECTED = 9, ///< Available since 4.1.0. NNAPI level 30. - NEURON_HASHTABLE_LOOKUP = 10, ///< Not available. - NEURON_L2_NORMALIZATION = 11, ///< Available since 4.1.0. NNAPI level 30. - NEURON_L2_POOL_2D = 12, ///< Available since 4.1.0. NNAPI level 30. - NEURON_LOCAL_RESPONSE_NORMALIZATION = 13, ///< Not available. - NEURON_LOGISTIC = 14, ///< Available since 4.1.0. NNAPI level 30. - NEURON_LSH_PROJECTION = 15, ///< Not available. - NEURON_LSTM = 16, ///< Not available. - NEURON_MAX_POOL_2D = 17, ///< Available since 4.1.0. NNAPI level 30. - NEURON_MUL = 18, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RELU = 19, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RELU1 = 20, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RELU6 = 21, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RESHAPE = 22, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RESIZE_BILINEAR = 23, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RNN = 24, ///< Not available. - NEURON_SOFTMAX = 25, ///< Available since 4.1.0. NNAPI level 30. - NEURON_SPACE_TO_DEPTH = 26, ///< Available since 4.1.0. NNAPI level 30. - NEURON_SVDF = 27, ///< Not available. - NEURON_TANH = 28, ///< Available since 4.1.0. NNAPI level 30. - NEURON_BATCH_TO_SPACE_ND = 29, ///< Available since 4.1.0. NNAPI level 30. - NEURON_DIV = 30, ///< Available since 4.1.0. NNAPI level 30. - NEURON_MEAN = 31, ///< Available since 4.1.0. NNAPI level 30. 
- NEURON_PAD = 32, ///< Available since 4.1.0. NNAPI level 30. - NEURON_SPACE_TO_BATCH_ND = 33, ///< Available since 4.1.0. NNAPI level 30. - NEURON_SQUEEZE = 34, ///< Available since 4.1.0. NNAPI level 30. - NEURON_STRIDED_SLICE = 35, ///< Available since 4.1.0. NNAPI level 30. - NEURON_SUB = 36, ///< Available since 4.1.0. NNAPI level 30. - NEURON_TRANSPOSE = 37, ///< Available since 4.1.0. NNAPI level 30. - NEURON_ABS = 38, ///< Available since 4.1.0. NNAPI level 30. - NEURON_ARGMAX = 39, ///< Available since 4.1.0. NNAPI level 30. - NEURON_ARGMIN = 40, ///< Available since 4.1.0. NNAPI level 30. - NEURON_AXIS_ALIGNED_BBOX_TRANSFORM = - 41, ///< Available since 4.1.0. NNAPI level 30. - NEURON_BIDIRECTIONAL_SEQUENCE_LSTM = 42, ///< Not available. - NEURON_BIDIRECTIONAL_SEQUENCE_RNN = 43, ///< Not available. - NEURON_BOX_WITH_NMS_LIMIT = 44, ///< Available since 4.1.0. NNAPI level 30. - NEURON_CAST = 45, ///< Available since 4.1.0. NNAPI level 30. - NEURON_CHANNEL_SHUFFLE = 46, ///< Available since 4.1.0. NNAPI level 30. - NEURON_DETECTION_POSTPROCESSING = 47, ///< Not available. - NEURON_EQUAL = 48, ///< Available since 4.1.0. NNAPI level 30. - NEURON_EXP = 49, ///< Available since 4.1.0. NNAPI level 30. - NEURON_EXPAND_DIMS = 50, ///< Available since 4.1.0. NNAPI level 30. - NEURON_GATHER = 51, ///< Available since 4.1.0. NNAPI level 30. - NEURON_GENERATE_PROPOSALS = 52, ///< Not available. - NEURON_GREATER = 53, ///< Available since 4.1.0. NNAPI level 30. - NEURON_GREATER_EQUAL = 54, ///< Available since 4.1.0. NNAPI level 30. - NEURON_GROUPED_CONV_2D = 55, ///< Available since 4.1.0. NNAPI level 30. - NEURON_HEATMAP_MAX_KEYPOINT = 56, ///< Available since 4.1.0. NNAPI level 30. - NEURON_INSTANCE_NORMALIZATION = - 57, ///< Available since 4.1.0. NNAPI level 30. - NEURON_LESS = 58, ///< Available since 4.1.0. NNAPI level 30. - NEURON_LESS_EQUAL = 59, ///< Available since 4.1.0. NNAPI level 30. - NEURON_LOG = 60, ///< Not available. - NEURON_LOGICAL_AND = 61, ///< Available since 4.1.0. NNAPI level 30. - NEURON_LOGICAL_NOT = 62, ///< Available since 4.1.0. NNAPI level 30. - NEURON_LOGICAL_OR = 63, ///< Available since 4.1.0. NNAPI level 30. - NEURON_LOG_SOFTMAX = 64, ///< Not available. - NEURON_MAXIMUM = 65, ///< Available since 4.1.0. NNAPI level 30. - NEURON_MINIMUM = 66, ///< Available since 4.1.0. NNAPI level 30. - NEURON_NEG = 67, ///< Available since 4.1.0. NNAPI level 30. - NEURON_NOT_EQUAL = 68, ///< Available since 4.1.0. NNAPI level 30. - NEURON_PAD_V2 = 69, ///< Available since 4.1.0. NNAPI level 30. - NEURON_POW = 70, ///< Available since 4.1.0. NNAPI level 30. - NEURON_PRELU = 71, ///< Available since 4.1.0. NNAPI level 30. - NEURON_QUANTIZE = 72, ///< Available since 4.1.0. NNAPI level 30. - NEURON_QUANTIZED_16BIT_LSTM = 73, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RANDOM_MULTINOMIAL = 74, ///< Not available. - NEURON_REDUCE_ALL = 75, ///< Available since 4.1.0. NNAPI level 30. - NEURON_REDUCE_ANY = 76, ///< Available since 4.1.0. NNAPI level 30. - NEURON_REDUCE_MAX = 77, ///< Available since 4.1.0. NNAPI level 30. - NEURON_REDUCE_MIN = 78, ///< Available since 4.1.0. NNAPI level 30. - NEURON_REDUCE_PROD = 79, ///< Not available. - NEURON_REDUCE_SUM = 80, ///< Available since 4.1.0. NNAPI level 30. - NEURON_ROI_ALIGN = 81, ///< Available since 4.1.0. NNAPI level 30. - NEURON_ROI_POOLING = 82, ///< Not available. - NEURON_RSQRT = 83, ///< Available since 4.1.0. NNAPI level 30. - NEURON_SELECT = 84, ///< Available since 4.1.0. NNAPI level 30. 
- NEURON_SIN = 85, ///< Not available. - NEURON_SLICE = 86, ///< Available since 4.1.0. NNAPI level 30. - NEURON_SPLIT = 87, ///< Available since 4.1.0. NNAPI level 30. - NEURON_SQRT = 88, ///< Available since 4.1.0. NNAPI level 30. - NEURON_TILE = 89, ///< Available since 4.1.0. NNAPI level 30. - NEURON_TOPK_V2 = 90, ///< Available since 4.1.0. NNAPI level 30. - NEURON_TRANSPOSE_CONV_2D = 91, ///< Available since 4.1.0. NNAPI level 30. - NEURON_UNIDIRECTIONAL_SEQUENCE_LSTM = 92, ///< Not available. - NEURON_UNIDIRECTIONAL_SEQUENCE_RNN = 93, ///< Not available. - NEURON_RESIZE_NEAREST_NEIGHBOR = - 94, ///< Available since 4.1.0. NNAPI level 30. - NEURON_QUANTIZED_LSTM = 95, ///< Not available. - NEURON_IF = 96, ///< Available since 4.1.0. NNAPI level 30. - NEURON_WHILE = 97, ///< Available since 4.1.0. NNAPI level 30. - NEURON_ELU = 98, ///< Not available. - NEURON_HARD_SWISH = 99, ///< Available since 4.1.0. NNAPI level 30. - NEURON_FILL = 100, ///< Available since 4.1.0. NNAPI level 30. - NEURON_RANK = 101, ///< Not available. - NEURON_BATCH_MATMUL = 102, ///< Available since 5.1.2. NNAPI FL6. - NEURON_PACK = 103, ///< Not available. - NEURON_MIRROR_PAD = 104, ///< Not available. - NEURON_MIRROR_REVERSE = 105, ///< Not available. - /** - * Decompress HyFBC to YUV420 frame, support both YUV420_8BITS and - * YUV420_10BITS formats. HyFBC (Hybrid Frame Buffer Compression) is a - * compressed format used by video decoder (VDEC). This format uses YUV420 to - * compress. - * - * For input part, need to set two inputs with different shape, representing Y - * and UV plane respectively. The same HyFBC data will be used for both - * inputs. Similarly, the output part also needs to be set to two, - * representing Y and UV plane respectively. - * - * The shape of the two inputs/ outputs (inputY, inputUV, outputY, outputUV) - * depends on the original images' shape ([batches, height, width, channels]). - * Both height and width shold follow 64 alignment rule. For example, if - * original height is 480, its 64 alignment should be 512. For Y plane, - * channel size should be 1; for UV plane, channel size should be 2. Besides, - * the height and width of UV plane should be half of Y's height and width. - * Example: - * - * original_img.shape = [1, 384, 640, 3] - * inputY.shape = [1, 384, 640, 1] - * inputUV.shape = [1, 192, 320, 2] - * outputY.shape = [1, 384, 640, 1] - * outputUV.shape = [1, 192, 320, 2] - * - * Supported tensor {@link OperandCode}: - * * {@link NEURON_EXT_TENSOR_RAW} (for inputY, inputUV) - * * {@link NEURON_TENSOR_QUANT8_ASYMM} (for outputY, outputUV) - * * {@link NEURON_TENSOR_QUANT16_ASYMM} (for outputY, outputUV) - * Note: - * If image mode is YUV420_8BITS, use NEURON_TENSOR_QUANT8_ASYMM; if mode is - * YUV420_10BITS, use NEURON_TENSOR_QUANT16_ASYMM. - * - * Tensor rank: both input and output require rank 4, with "NHWC" data layout. - * - * Inputs: - * * 0: inputY, a 4-D {@link NEURON_EXT_TENSOR_RAW} tensor. - * * 1: inputUV, a 4-D {@link NEURON_EXT_TENSOR_RAW} tensor. - * * 2: YHeaderAlignment, an {@link NEURON_INT32} scalar, specifying - * the header alignment in Hyfbc format. - * * 3: UVHeaderAlignment, an {@link NEURON_INT32} scalar, specifying - * the header alignment in Hyfbc format. - * * 4: xAlign, an {@link NEURON_INT32} scalar, specifying the frame - * width alignment of video decoder. - * * 5: yAlign, an {@link NEURON_INT32} scalar, specifying the frame - * height alignment of video decoder. 
- * * 6: xOffset, an {@link NEURON_INT32} scalar, specifying the frame - * width offset of video decoder. - * * 7: yOffset, an {@link NEURON_INT32} scalar, specifying the frame - * height offset of video decoder. - * * 8: mode, an {@link NEURON_INT32} scalar. Set to 0 for - * YUV420_8BITS. Set to 1 for YUV420_10BITS. Note that 8b, 10b here means the - * compressed bit width in Hyfbc frame, where the decompressed YUV420 is 8b - * for Hyfbc_8b, and YUV420 is 16b for Hyfbc_10b. - * * 9: outPitchN, an {@link NEURON_INT32} scalar, specifying the - * YUV420 N-axis pitch. Must be set to 1, because only a single batch is - * supported for HyfbcDecompress. - * * 10: outPitchH, an {@link NEURON_INT32} scalar, specifying the - * YUV420 H-axis pitch. Set to the original compressed image height with video - * codec alignment. - * * 11: outPitchW, an {@link NEURON_INT32} scalar, specifying the - * YUV420 W-axis pitch. Set to the original compressed image width with video - * codec alignment. - * * 12: outPitchC, an {@link NEURON_INT32} scalar, specifying the - * YUV420 C-axis pitch. Set to 1 for interleaved YUV420. - * - * Outputs: - * * 0: output Y, a 4-D tensor. Tensor type can be either {@link - * NEURON_TENSOR_QUANT8_ASYMM} or {@link - * NEURON_TENSOR_QUANT16_ASYMM}, depends on YUV420 bit mode. - * * 1: output UV, a 4-D tensor. Tensor type can be either {@link - * NEURON_TENSOR_QUANT8_ASYMM} or {@link - * NEURON_TENSOR_QUANT16_ASYMM}, depends on YUV420 bit mode. - * - * Available since NeuroPilot 7.0.0. - */ - NEURON_HYFBCTOYUV420 = 106, - /** - * Compress YUV420 to AFBC frame, support both YUV420_8BITS and - * YUV420_10BITS formats. AFBC (Arm Frame Buffer Compression) is a lossless - * compressed image format, created by ARM to reduce the size of images. - * - * For input part, need to set two inputs with different shape, representing Y - * and UV plane respectively. For output part, need to set one output for - * AFBC. - * - * The shape of the two inputs (inputY, inputUV) and output (AFBC) - * depends on the original images' shape ([batches, height, width, channels]). - * Both height and width shold follow 64 alignment rule. For example, if - * original height is 480, its 64 alignment should be 512. For Y plane, - * channel size should be 1; for UV plane, channel size should be 2. Besides, - * the height and width of UV plane should be half of Y's height and width. - * For AFBC output, its height shoud be 3/2 of Y's height, and its width - * equals to Y's width. Example: - * - * original_img.shape = [1, 384, 640, 3] - * inputY.shape = [1, 384, 640, 1] - * inputUV.shape = [1, 192, 320, 2] - * output.shape = [1, 576, 640, 1] - * - * Supported tensor {@link OperandCode}: - * * {@link NEURON_EXT_TENSOR_RAW} (for output) - * * {@link NEURON_TENSOR_QUANT8_ASYMM} (for inputY, inputUV) - * * {@link NEURON_TENSOR_QUANT16_ASYMM} (for inputY, inputUV) - * Note: - * If image mode is YUV420_8BITS, use NEURON_TENSOR_QUANT8_ASYMM; if mode is - * YUV420_10BITS, use NEURON_TENSOR_QUANT16_ASYMM. - * - * Tensor rank: both input and output require rank 4, with "NHWC" data layout. - * - * Inputs: - * * 0: inputY, a 4-D tensor. Tensor type can be either {@link - * NEURON_TENSOR_QUANT8_ASYMM} or {@link - * NEURON_TENSOR_QUANT16_ASYMM}, depends on YUV420 bit mode. - * * 1: inputUV, a 4-D tensor. Tensor type can be either {@link - * NEURON_TENSOR_QUANT8_ASYMM} or {@link - * NEURON_TENSOR_QUANT16_ASYMM}, depends on YUV420 bit mode. 
- * * 2: HeaderAlignment, an {@link NEURON_INT32} scalar, specifying - * the header alignment in AFBC format. - * * 3: xAlign, an {@link NEURON_INT32} scalar, specifying the frame - * width alignment of AFBC format. - * * 4: yAlign, an {@link NEURON_INT32} scalar, specifying the frame - * height alignment of AFBC format. - * * 5: xOffset, an {@link NEURON_INT32} scalar, specifying the frame - * width offset of AFBC format. - * * 6: yOffset, an {@link NEURON_INT32} scalar, specifying the frame - * height offset of AFBC format. - * * 7: mode, an {@link NEURON_INT32} scalar. Set to 0 for - * YUV420_8BITS. Set to 1 for YUV420_10BITS. Note that 8b, 10b here means the - * compressed bit width in AFBC frame, where the YUV420 must be 8b for - * AFBC_8b, and must be 16b for AFBC_10b. - * * 8: inPitchN, an {@link NEURON_INT32} scalar, specifying the - * YUV420 N-axis pitch. Must be set to 1, because only a single batch is - * supported for AfbcCompress. - * * 9: inPitchH, an {@link NEURON_INT32} scalar, specifying the - * YUV420 H-axis pitch. Set to the expected compressed image height. - * * 10: inPitchW, an {@link NEURON_INT32} scalar, specifying the - * YUV420 W-axis pitch. Set to the expected compressed image height. - * * 11: inPitchC, an {@link NEURON_INT32} scalar, specifying the - * YUV420 C-axis pitch. Set to 1 for interleaved YUV420. - * - * Outputs: - * * 0: output, a 4-D {@link NEURON_EXT_TENSOR_RAW} tensor. - * - * Available since NeuroPilot 7.0.0. - */ - NEURON_YUV420TOAFBC = 107, - NEURON_NUMBER_OF_OPERATIONS, -} NeuronOperationType; - -/** - * Fused activation function types. - */ -typedef enum { - // NO fused activation function. - NEURON_FUSED_NONE = 0, - // Fused ReLU activation function. - NEURON_FUSED_RELU = 1, - // Fused ReLU1 activation function. - NEURON_FUSED_RELU1 = 2, - // Fused ReLU6 activation function. - NEURON_FUSED_RELU6 = 3, -} NeuronAdapterFuseCode; - -/** - * Implicit padding algorithms. - */ -typedef enum { - /** - * SAME padding. - * Padding on both ends are the "same": - * padding_to_beginning = total_padding / 2 - * padding_to_end = (total_padding + 1)/2. - * i.e., for even number of padding, padding to both ends are exactly - * the same; for odd number of padding, padding to the ending is bigger - * than the padding to the beginning by 1. - * - * total_padding is a function of input, stride and filter size. - * It could be computed as follows: - * out_size = (input + stride - 1) / stride; - * needed_input = (out_size - 1) * stride + filter_size - * total_padding = max(0, needed_input - input_size) - * The computation is the same for the horizontal and vertical directions. - */ - NEURON_PADDING_SAME = 1, - - /** - * VALID padding. - * No padding. When the input size is not evenly divisible by - * the filter size, the input at the end that could not fill - * the whole filter tile will simply be ignored. - */ - NEURON_PADDING_VALID = 2, -} NeuronAdapterPaddingCode; - -/** - * Execution preferences. - */ -typedef enum { - /* Prefer executing in a way that minimizes battery drain. */ - NEURON_PREFER_LOW_POWER = 0, - /* Prefer executing as fast as possible. (more power consumption)*/ - NEURON_PREFER_FAST_SINGLE_ANSWER = 1, - /* Prefer maximizing the throughput of successive frames */ - NEURON_PREFER_SUSTAINED_SPEED = 2, - /* Prefer executing with turbo boost. (most power consumption) */ - NEURON_PREFER_TURBO_BOOST = 3, -} NeuronAdapterPreferenceCode; - -/** - * Relative execution priority. 
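The NEURON_PADDING_SAME description above gives total_padding as a function of the input size, stride, and filter size, split unevenly between the two ends when it is odd. A short sketch of that computation with one worked case:

    def same_padding(input_size: int, stride: int, filter_size: int):
        # Mirrors the SAME-padding formula quoted above, using integer division.
        out_size = (input_size + stride - 1) // stride
        needed_input = (out_size - 1) * stride + filter_size
        total_padding = max(0, needed_input - input_size)
        return total_padding // 2, (total_padding + 1) // 2  # (begin, end)

    # A width-224 input with stride 2 and a size-3 filter needs (0, 1) padding.
    print(same_padding(224, 2, 3))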
- */ -typedef enum { - NEURON_PRIORITY_LOW = 90, - NEURON_PRIORITY_MEDIUM = 100, - NEURON_PRIORITY_HIGH = 110, - NEURON_PRIORITY_DEFAULT = NEURON_PRIORITY_MEDIUM, -} NeuronAdapterPriorityCode; - -/** - * Compiler optimization hint. - */ -typedef enum { - /** - * Normal optimization. - * Available since 4.3.1 - */ - NEURON_OPTIMIZATION_NORMAL = 0, - /** - * Reduce latency by utilizing as many APU cores as possible. - * Available since 4.3.1 - */ - NEURON_OPTIMIZATION_LOW_LATENCY = 1 << 0, - /** - * Reducing DRAM access as more as possible. - * Available since 4.4.0 - */ - NEURON_OPTIMIZATION_DEEP_FUSION = 1 << 1, - /** - * Reduce latency by using as many APU cores as possible in batch-dimension. - * (For models with batch > 1) - * Available since 4.4.0 - */ - NEURON_OPTIMIZATION_BATCH_PROCESSING = 1 << 2, - /** - * Default optimization setting. - * Available since 4.3.1 - */ - NEURON_OPTIMIZATION_DEFAULT = NEURON_OPTIMIZATION_NORMAL, -} OptimizationCode; - -/** - * CPU cache flush hint. - */ -typedef enum { - /** - * Sync input buffer and invalidate output buffer. - * Available since 5.0.1 - */ - NEURON_CACHE_FLUSH_ENABLE_ALL = 0, - /** - * Disable sync input buffer. - * Available since 5.0.1 - */ - NEURON_CACHE_FLUSH_DISABLE_SYNC_INPUT = 1 << 0, - /** - * Disable invalidate output buffer. - * Available since 5.0.1 - */ - NEURON_CACHE_FLUSH_DISABLE_INVALIDATE_OUTPUT = 1 << 1, - /** - * Default cache flush setting. - * Available since 5.0.1 - */ - NEURON_CACHE_FLUSH_DEFAULT = NEURON_CACHE_FLUSH_ENABLE_ALL, -} CacheFlushCode; - -/** - * Compilation Type. - */ -typedef enum { - /* Normal Compilation Available since 7.0.0 */ - COMPILATION_TYPE_NORMAL = 0, - /* @deprecate */ - COMPILATION_TYPE_DEBUG_PLUS = 1, - /* Batched Execution: Set input/output from memory every time. - * Available since 7.0.0 - */ - COMPILATION_TYPE_BATCHED = 2, - /* One compilation with multi-executions could be created. - * Available since 7.0.0 - */ - COMPILATION_TYPE_MULTI_EXECUTIONS = 3, - /* Batched Execution: Set input/output from memory 1st time and memcpy next - * time. Available since 7.0.1 - */ - COMPILATION_TYPE_EXECUTION_CONTROLLER = 4, -} CompilationType; - -/** - * Supported Feature - */ -typedef enum { - NEURON_FEATURE_NONE = 0, - NEURON_THROUGHPUT_MODE = 1, -} NeuronFeatureType; - -/** - * The structure to represent the neuron version. - */ -typedef struct { - uint8_t major; ///< major version - uint8_t minor; ///< minor version - uint8_t patch; ///< patch version -} NeuronRuntimeVersion; - -/** - * Get the version of Neuron runtime library. - * - * @param version the version of Neuron runtime library. - * @return NEURON_NO_ERROR - */ -int Neuron_getVersion(NeuronRuntimeVersion* version); - -/** - * Get the supported status of feature. - * - * Available since 7.0.0 - * - * @param type input feature @NeuronFeatureType to check supported or not - * @param supported return the supported status - * @return NEURON_NO_ERROR if successful. - */ -int Neuron_getFeatureSupportedStatus(NeuronFeatureType type, bool* supported); - -/** - * Get the size of L1 memory in APU. - * - * Available since 4.3.0 - * - * @param sizeKb L1 memory size in KB - * @return NEURON_NO_ERROR if successful. - */ -int Neuron_getL1MemorySizeKb(uint32_t* sizeKb); - -/** - * Creates a shared memory object from a file descriptor. - * - * For ion descriptor, application should create the ion memory and descriptor - * first and then use it in this function. - * - * Available since 4.1.0 Only supports ion fd. 
- * - * @param size The requested size in bytes. Must not be larger than the file - * size. - * @protect The desired memory protection for the mapping. It is either - * PROT_NONE or the bitwise OR of one or more of the following flags: PROT_READ, - * PROT_WRITE. - * @fd The requested file descriptor. The file descriptor has to be mmap-able. - * @offset The offset to the beginning of the file of the area to map. - * @memory The memory object to be created. Set to NULL if unsuccessful. - */ -int NeuronMemory_createFromFd( - size_t size, - int protect, - int fd, - size_t offset, - NeuronMemory** memory); - -#ifdef __ANDROID__ -/** - * Creates a shared memory object from an AHardwareBuffer handle. - * - * We only support AHardwareBuffer with format AHARDWAREBUFFER_FORMAT_BLOB and - * it can only be used for Model inputs and outputs. - * - * The AHardwareBuffer with AHARDWAREBUFFER_FORMAT_BLOB format can be used the - * same way as shared memory created from a file handle. See NeuronMemory for - * description on how to use this shared memory. - * - * The provided AHardwareBuffer must outlive the NeuronMemory object. - * - * Available since 5.0.0 - * - * @param ahwb The AHardwareBuffer handle. - * @param memory The memory object to be created. - * Set to NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if the request completed normally. - * - */ -int NeuronMemory_createFromAHardwareBuffer( - const AHardwareBuffer* ahwb, - NeuronMemory** memory); - -#else // __ANDROID__ - -/** - * Not supported at non-android platform - * - * @return NEURON_BAD_STATE - */ -int NeuronMemory_createFromAHardwareBuffer(); - -#endif - -/** - * Delete a memory object. - * - * For ion memory, this function cleans up the internal resource associated with - * this memory. Applications should clean up the allocated ion memory after this - * function. - * - * Available since 4.1.0 - */ -void NeuronMemory_free(NeuronMemory* memory); - -/** - * Create an empty NeuronModel. The model should be constructed with calls to - * NeuronModel_addOperation and NeuronModel_addOperand. - * - * Available since 4.1.0 - * - * @param model The NeuronModel to be created. Set to NULL if unsuccessful. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_create(NeuronModel** model); - -/** - * Destroy a model. The model need not have been finished by a call to - * NeuronModel_finish. - * - * Available since 4.1.0 - * - * @param model The model to be destroyed. - */ -void NeuronModel_free(NeuronModel* model); - -/** - * Indicate that we have finished modifying a model. Required before calling - * NeuronCompilation_compile. - * - * Available since 4.1.0 - * - * @param model The model to be finished. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_finish(NeuronModel* model); - -/** - * Add an operand to a model. The order in which the operands are added is - * important. The first one added to a model will have the index value 0, the - * second 1, etc. These indexes are used as operand identifiers in - * NeuronModel_addOperation. - * - * Available since 4.1.0 - * - * @param model The model to be modified. - * @param type The NeuronOperandType that describes the shape of the operand. - * Neither the NeuronOperandType nor the dimensions it points to need to outlive - * the call to NeuronModel_addOperand. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_addOperand(NeuronModel* model, const NeuronOperandType* type); - -/** - * Sets an operand to a constant value. 
- * Values of length smaller or equal to - * NEURON_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES are immediately copied into the - * model. For values of length greater than - * NEURON_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES, a pointer to the buffer is - * stored within the model. The application must not change the content of this - * region until all executions using this model have completed. As the data may - * be copied during processing, modifying the data after this call yields - * undefined results. - * - * Attempting to modify a model once NeuronModel_finish has been called will - * return an error. - * - * A special notice on the buffer lifetime when the length is greater than - * NEURON_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES. The provided buffer must - * outlive the compilation of this model. I.e. user must keep the buffer - * unchanged until NeuronCompilation_finish of this model. This is an internal - * optimization comparing to NNAPI. In NNAPI, NN runtime will copy the buffer to - * a shared memory between NN runtime and NNAPI HIDL service during - * ANNModel_finish, and it will be copied again to the compiled result during - * ANNCompilation_finish. In Neuron Adapter, there will be only one copying - * during NeuronCompilaiton_finish, so it is required to keep the buffer alive - * until NeuronCompilaiton_finish returned. - * - * Available since 4.1.0 - * - * @param model The model to be modified. - * @param index The index of the model operand we're setting. - * @param buffer A pointer to the data to use. - * @param length The size in bytes of the data value. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_setOperandValue( - NeuronModel* model, - int32_t index, - const void* buffer, - size_t length); -/** - * Sets an operand to a value that is a reference to another NeuronModel. - * - * The referenced model must already have been finished by a call to - * NeuronModel_finish. - * - * The NeuronModel_relaxComputationFloat32toFloat16 setting of referenced models - * is overridden by that setting of the main model of a compilation. - * - * The referenced model must outlive the model referring to it. - * - * Attempting to modify a model once NeuronModel_finish has been called will - * return an error. - * - * Available since 4.1.0 - * - * @param model The model to be modified. - * @param index The index of the model operand we're setting. - * @param value The model to be referenced. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_setOperandValueFromModel( - NeuronModel* model, - int32_t index, - const NeuronModel* value); - -/** - * Sets an operand's per channel quantization parameters - * Sets parameters required by a tensor of type - * NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL This function must be called for every - * tensor of type NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL before calling - * NeuronModel_finish - * - * Available since 4.1.0 - * - * @param model The model to be modified. - * @param index The index of the model operand we're setting. - * @param channelQuant The per channel quantization parameters for the operand. - * No memory in this struct needs to outlive the call to this function. - * - * @return NEURON_NO_ERROR if successful. 
- */ -int NeuronModel_setOperandSymmPerChannelQuantParams( - NeuronModel* model, - int32_t index, - const NeuronSymmPerChannelQuantParams* channelQuant); - -/** - * Sets an operand's per channel quantization parameters - * Sets parameters required by a tensor of type - * NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL or - * NEURON_TENSOR_QUANT8_ASYMM_PER_CHANNEL. - * This function must be called for every tensor of type - * NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL or - * NEURON_TENSOR_QUANT8_ASYMM_PER_CHANNEL before calling NeuronModel_finish. - * - * Available since 6.0.0 - * - * @param model The model to be modified. - * @param index The index of the model operand we're setting. - * @param channelQuant The per channel quantization parameters(include - * per-channel offset) for the operand. No memory in this struct needs to - * outlive the call to this function. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_setOperandPerChannelQuantParams( - NeuronModel* model, - int32_t index, - const NeuronPerChannelQuantParams* channelQuant); - -/** - * Add an operation to a model. - * The operands specified by inputs and outputs must have been previously added - * by calls to NeuronModel_addOperand. - * - * Available since 4.1.0 - * - * @param model The model to be modified. - * @param type The NeuronOperationType of the operation. - * @param inputCount The number of entries in the inputs array. - * @param inputs An array of indexes identifying each operand. - * @param outputCount The number of entries in the outputs array. - * @param outputs An array of indexes identifying each operand. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_addOperation( - NeuronModel* model, - NeuronOperationType type, - uint32_t inputCount, - const uint32_t* inputs, - uint32_t outputCount, - const uint32_t* outputs); - -/** - * Add an operation extension to a model. - * The operands specified by inputs and outputs must have been previously added - * by calls to NeuronModel_addOperand. User needs to specify the operation - * extension name and the desired device which will execute the operation - * extension. - * - * Available since 4.1.0 - * - * @param model The model to be modified. - * @param name The name of the operation extension. - * @param vendor The name of the vendor which will implement the operation - * extension. - * @param device The device which will execute the operation extension. - * @param inputCount The number of entries in the inputs array. - * @param inputs An array of indexes identifying each operand. - * @param outputCount The number of entries in the outputs array. - * @param outputs An array of indexes identifying each operand. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_addOperationExtension( - NeuronModel* model, - const char* name, - const char* vendor, - const NeuronDevice* device, - uint32_t inputCount, - const uint32_t* inputs, - uint32_t outputCount, - const uint32_t* outputs); - -/** - * Specfifies which operands will be the model's inputs and outputs. - * An operand cannot be used for both input and output. Doing so will return an - * error. - * - * The operands specified by inputs and outputs must have been - * previously added by calls to NeuronModel_addOperand. - * - * Attempting to modify a model once NeuronModel_finish has been - * called will return an error. - * - * Available since 4.1.0 - * - * @param model The model to be modified. - * @param inputCount The number of entries in the inputs array. 
- * @param inputs An array of indexes identifying the input operands. - * @param outputCount The number of entries in the outputs array. - * @param outputs An array of indexes identifying the output operands. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_identifyInputsAndOutputs( - NeuronModel* model, - uint32_t inputCount, - const uint32_t* inputs, - uint32_t outputCount, - const uint32_t* outputs); - -/** - * Gets the supported operations in a model. - * This function must be called after calling NeuronModel_finish - * - * Available since 4.1.0 - * - * @param model The model to be queried. - * @param supported The boolean array to be filled. True means supported. The - * size of the boolean array must be at least as large as the number of - * operations in the model. The order of elements in the supported array matches - * the order in which the corresponding operations were added to the model. - * @param operationCount number of operations in the model - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_getSupportedOperations( - NeuronModel* model, - bool* supported, - uint32_t operationCount); - -/** - * Get the supported operations for a specified set of devices. - * If multiple devices are selected, the supported operation list is a union of - * supported operations of all selected devices. - * - * Available since 4.1.0 - * - * @param model The model to be queried. - * @param devices Selected devices - * @param numDevices Number of selected devices - * @param supportedOps The boolean array to be filled. True means supported. The - * size of the boolean array must be as least as large as the number of - * operations in the model. The order of elements in the supportedOps array - * matches the order in which the corresponding operations were added to the - * model. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_getSupportedOperationsForDevices( - const NeuronModel* model, - const NeuronDevice* const* devices, - uint32_t numDevices, - bool* supportedOps); - -/** - * Specifies whether NEURON_TENSOR_FLOAT32 is allowed to be calculated with - * range and/or precision as low as that of the IEEE 754 16-bit floating-point - * format. By default, NEURON_TENSOR_FLOAT32 must be calculated using at least - * the range and precision of the IEEE 754 32-bit floating-point format. - * - * Available since 4.1.0 - * - * @param model The model to be modified. - * @param allow 'true' indicates NEURON_TENSOR_FLOAT32 may be calculated with - * range and/or precision as low as that of the IEEE 754 16-bit floating point - * format. 'false' indicates NEURON_TENSOR_FLOAT32 must be calculated using at - * least the range and precision of the IEEE 754 32-bit floating point format. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_relaxComputationFloat32toFloat16( - NeuronModel* model, - bool allow); - -/** - * Hint compiler to suppress the input data conversion, the users have to - * convert the input data into platform-expected format before inference. - * - * Available since 4.2.0 - * - * @param model The model to be modified. - * @param suppress True to suppress the input data conversion. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_suppressInputConversion(NeuronModel* model, bool suppress); - -/** - * Hint compiler to suppress the output data conversion, the users have to - * convert the output data from platform-generated format before inference. 
- * - * Available since 4.2.0 - * - * @param model The model to be modified. - * @param suppress True to suppress the output data conversion. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_suppressOutputConversion(NeuronModel* model, bool suppress); - -/** - * Restore the compiled network using user provided buffer. - * - * The restored NeuronCompilaton could be used in creating executing instance. - * The restored NeuronModel cannot be recompiled. - * - * Available since 4.3.0 - * - * @param model Restored model. - * @param compilation Restored compilation - * @param buffer User provided buffer to restore the compiled network. - * @param size Size of the user provided buffer in bytes. - * @return NEURON_NO_ERROR if compiled network is successfully copied to the - * user allocated buffer. NEURON_BAD_DATA if it fails to load the compiled - * network, this could either be the version is not matched or the data is - * corrupted. - */ -int NeuronModel_restoreFromCompiledNetwork( - NeuronModel** model, - NeuronCompilation** compilation, - const void* buffer, - const size_t size); - -/** - * Restore the compiled network using user provided buffer. - * Support multiple compilation type; choices are: COMPILATION_TYPE_BATCHED, - * COMPILATION_TYPE_EXECUTION_CONTROLLER, COMPILATION_TYPE_EXECUTION_CONTROLLER, - * and COMPILATION_TYPE_NORMAL. - * - * There are two ways to use Batched Compilation: - * 1) load from DLA. - * 2) create batched compilation directly. - * To load DLA, one should call NeuronCompilation_create and - * NeuronModel_restoreFromCompiledNetworkV2. To create directly, one should call - * NeuronCompilation_createForBatch. - * - * The restored NeuronCompilaton could be used in creating executing instance. - * The restored NeuronModel cannot be recompiled. - * - * Available since 7.0.0 - * - * @param model Restored model. - * @param compilation Restored compilation - * @param buffer User provided buffer to restore the compiled network. - * @param size Size of the user provided buffer in bytes. - * @param type Type of the compilation needed to be restored. - * @return NEURON_NO_ERROR if compiled network is successfully copied to the - * user allocated buffer. NEURON_BAD_DATA if it fails to load the compiled - * network, this could either be the version is not matched or the data is - * corrupted. - */ -int NeuronModel_restoreFromCompiledNetworkV2( - NeuronModel** model, - NeuronCompilation** compilation, - const void* buffer, - const size_t size, - const CompilationType& type); - -/** - * Set a string into model that can be used for recognition for user. - * It's only used for debug, the string can be dumped into log and make users - * check the model behavior easily. - * - * Available since 7.0.0 - * - * @param model The model to be modified. - * @param name The string, user can free buffer 'name' after calling this API. - * @return NEURON_NO_ERROR if the string is set success. NEURON_UNEXPECTED_NULL - * if the input param is nullptr. - */ -int NeuronModel_setName(NeuronModel* model, const char* name); - -/** - * Create a NeuronCompilation to compile the given model. - * - * This function only creates the object. Compilation is only performed once - * NeuronCompilation_finish is invoked. NeuronCompilation_finish should be - * called once all desired properties have been set on the compilation. - * NeuronModel_free should be called once the compilation is no longer needed. - * The provided model must outlive the compilation. 
The model must already have - * been finished by a call to NeuronModel_finish. - * - * Available since 4.1.0 - * - * @param model The NeuronModel to be compiled. - * @param compilation The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful - */ -int NeuronCompilation_create( - NeuronModel* model, - NeuronCompilation** compilation); - -/** - * Create a NeuronCompilation with different purpose to compile the given model. - * - * This function only creates the object. Compilation is only performed once - * NeuronCompilation_finish is invoked. NeuronCompilation_finish should be - * called once all desired properties have been set on the compilation. - * NeuronModel_free should be called once the compilation is no longer needed. - * The provided model must outlive the compilation. The model must already have - * been finished by a call to NeuronModel_finish. - * - * Available since 7.0.1 - * - * @param model The NeuronModel to be compiled. - * @param type Type of the compilation needed to be created. - * @param options The options which used to create with compilation. - * @param compilation The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful - */ -int NeuronCompilation_createV2( - NeuronModel* model, - CompilationType type, - const char* options, - NeuronCompilation** compilation); - -/** - * Destroy a compilation. - * - * Available since 4.1.0 - * - * @param compilation The compilation to be destroyed. - */ -void NeuronCompilation_free(NeuronCompilation* compilation); - -/** - * Compilation is finished once NeuronCompilation_finish is invoked. Required - * before calling NeuronExecution_create. This function must only be called once - * for a given compilation. - * - * Available since 4.1.0 - * - * @param compilation The compilation to be finished. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_finish(NeuronCompilation* compilation); - -/** - * Gets the supported operations in a model with specific optimized configures. - * This function must be called before calling NeuronCompilation_finish. - * - * Available since 7.0.0 - * - * @param compilation The compilation to be queried. - * @param operationCount number of operations in the model - * @param supported The boolean array to be filled. True means supported. The - * size of the boolean array must be at least as large as the number of - * operations in the model. The order of elements in the supported array matches - * the order in which the corresponding operations were added to the model. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getSupportedOperations( - NeuronCompilation* compilation, - uint32_t operationCount, - bool* supported); - -/** - * Provides optional caching information for faster re-compilation. - * - * Available since 4.1.0 - * - * @param compilation The compilation to be cached. - * @param cacheDir The cache directory for storing and retrieving caching data. - * The user should choose a directory local to the application, and is - * responsible for managing the cache entries. - * @param token The token provided by the user to specify a model must be of - * length NEURON_BYTE_SIZE_OF_CACHE_TOKEN. The user should ensure that the token - * is unique to a model within the application. Neuron cannot detect token - * collisions; a collision will result in a failed execution or in a successful - * execution that produces incorrect output values. 
- * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_setCaching( - NeuronCompilation* compilation, - const char* cacheDir, - const uint8_t* token); - -/** - * Hint compiler with the size of L1 memory, this value should not be larger - * than real platform's settings. The user can get the platform's L1 memory size - * in KB by calling Neuron_getL1MemorySizeKb. - * - * Available since 4.3.0 - * - * @param compilation The compilation to be modified. - * @param sizeKb L1 memory size in KB. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_setL1MemorySizeKb( - NeuronCompilation* compilation, - uint32_t sizeKb); - -/** - * Create a NeuronCompilation to compile the given model for a specified set of - * devices. The user must handle all compilation and execution failures from the - * specified set of devices. This is in contrast to a use of - * NeuronCompilation_create, where neuron will attempt to recover from such - * failures. - * - * Available since 4.1.0 - * - * @param model The NeuronModel to be compiled. - * @param devices The set of devices. Must not contain duplicates. - * @param numDevices The number of devices in the set. - * @param compilation The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the model is - * invalid. - */ -int NeuronCompilation_createForDevices( - NeuronModel* model, - const NeuronDevice* const* devices, - uint32_t numDevices, - NeuronCompilation** compilation); - -/** - * Create a NeuronCompilation. Which can divide one graph into several subgraph - * and use the information to debug. - * - * Only be used in debug purpose, no guarantees performance and thread safe. - * - * Available since 5.0.0 - * - * @param model The NeuronModel to be compiled. - * @param compilation The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the model is - * invalid. - */ -int NeuronCompilation_createForDebug( - NeuronModel* model, - NeuronCompilation** compilation); - -/** - * Sets the execution preference associated with this compilation. - * - * Default value of preference is PREFER_SINGLE_FAST_ANSWER - * - * Available since 4.1.0 - * - * @param compilation The compilation to be modified. - * @param preference Either NEURON_PREFER_LOW_POWER, - * NEURON_PREFER_SINGLE_FAST_ANSWER, or NEURON_PREFER_SUSTAINED_SPEED. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_setPreference( - NeuronCompilation* compilation, - int32_t preference); - -/** - * Sets the execution priority associated with this compilation. - * - * Execution priorities are relative to other executions created by the same - * application (specifically same uid) for the same device. Specifically, - * priorities of executions from one application will not affect executions from - * another application. - * - * Higher priority executions may use more compute resources than lower priority - * executions, and may preempt or starve lower priority executions. - * - * Available since 4.1.0 - * - * @param compilation The compilation to be modified. - * @param priority The relative priority of the execution compared to other - * executions created by the application. Must be one of NEURON_PRIORITY_*. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_setPriority(NeuronCompilation* compilation, int priority); - -/** - * Get the padded dimensional information of the specified input operand of the - * compilation. 
This function must be called after calling - * NeuronCompilation_finish. If NeuronModel_suppressInputConversion was not - * applied to the model to be compiled, the returned dimensions are the padded - * dimension after NeuronCompilation_finish to satisfy the optimization - * requirement from the underlying hardware accelerators. - * If NeuronModel_suppressInputConversion was applied to the model to be - * compiled, the returned dimensions are the same as the original dimensions - * given from user. - * - * Available since 4.2.0 - * - * @param compilation The compilation to be queried. - * @param index The index of the input operand we are querying. It is an index - * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the - * index associated with NeuronModel_addOperand. - * @param dimensions The dimension array to be filled. The size of the array - * must be exactly as large as the rank of the input operand to be queried in - * the model. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getInputPaddedDimensions( - NeuronCompilation* compilation, - int32_t index, - uint32_t* dimensions); - -/** - * Get the padded dimensional information of the specified output operand of the - * compilation. This function must be called after calling - * NeuronCompilation_finish. If NeuronModel_suppressOutputConversion was not - * applied to the model to be compiled, the returned dimensions are the padded - * dimension after NeuronCompilation_finish to satisfy the optimization - * requirement from the underlying hardware accelerators. - * If NeuronModel_suppressOutputConversion was applied to the model to be - * compiled, the returned dimensions are the same as the original dimensions - * given from user. - * - * Available since 4.2.0 - * - * @param compilation The compilation to be queried. - * @param index The index of the output operand we are querying. It is an index - * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the - * index associated with NeuronModel_addOperand. - * @param dimensions The dimension array to be filled. The size of the array - * must be exactly as large as the rank of the output operand to be queried in - * the model. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getOutputPaddedDimensions( - NeuronCompilation* compilation, - int32_t index, - uint32_t* dimensions); - -/** - * Get the expected buffer size (bytes) of the specified input operand of the - * compilation. If NeuronModel_suppressInputConversion was not applied to the - * model to be compiled, the returned size are the padded size after - * NeuronCompilation_finish to satisfy the optimization requirement from the - * underlying hardware accelerators. If NeuronModel_suppressInputConversion was - * applied to the model to be compiled, the returned size are the same as the - * original size given from user. - * - * Available since 4.2.0 - * - * @param compilation The compilation to be queried. - * @param index The index of the input operand we are querying. It is an index - * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the - * index associated with NeuronModel_addOperand. - * @param size the expected buffer size in bytes. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getInputPaddedSize( - NeuronCompilation* compilation, - int32_t index, - size_t* size); - -/** - * Get the expected buffer size (bytes) of the specified output operand of the - * compilation. 
If NeuronModel_suppressOutputConversion was not applied to the - * model to be compiled, the returned size are the padded size after - * NeuronCompilation_finish to satisfy the optimization requirement from the - * underlying hardware accelerators. If NeuronModel_suppressOutputConversion was - * applied to the model to be compiled, the returned size are the same as the - * original size given from user. - * - * Available since 4.2.0 - * - * @param compilation The compilation to be queried. - * @param index The index of the output operand we are querying. It is an index - * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the - * index associated with NeuronModel_addOperand. - * @param size the expected buffer size in bytes. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getOutputPaddedSize( - NeuronCompilation* compilation, - int32_t index, - size_t* size); - -/** - * Get the compiled network size of the compilation. - * - * This must be called after NeuronCompilation_finished and before - * NeuronExecution_create. It is not allowed to call this with a compilation - * restored from cache. - * - * Available since 4.3.0 - * - * @param compilation The compilation to be queried. - * @param size The compiled network size in bytes. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getCompiledNetworkSize( - NeuronCompilation* compilation, - size_t* size); - -/** - * Store the compiled network. - * - * Users have to allocate the buffer with the specified size before calling this - * function. - * - * This must be called after NeuronCompilation_finished and before - * NeuronExecution_create. It is not allowed to call this with a compilation - * restored from cache. - * - * Available since 4.3.0 - * - * @param compilation The compilation to be queried. - * @param buffer User allocated buffer to store the compiled network. - * @param size Size of the user allocated buffer in bytes. - * @return NEURON_NO_ERROR if compiled network is successfully copied to the - * user allocated buffer. - */ -int NeuronCompilation_storeCompiledNetwork( - NeuronCompilation* compilation, - void* buffer, - const size_t size); -/** - * Hint the compiler to apply the optimization strategy according to the user - * specified parameters. - * - * Available since 4.3.0 - * - * @param compilation The compilation to be modified. - * @param optimizationCode User specified optimization strategy. Must be one of - * NEURON_OPTIMIZATION_* or the inclusive OR value of multiple - * NEURON_OPTIMIZATION_*. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_setOptimizationHint( - NeuronCompilation* compilation, - uint32_t optimizationCode); - -/** - * Hint the compiler to apply the optimization strategy according to the user - * specified arguments in a null-terminated string. - * - * Available since 4.6.0 - * - * @param compilation The compilation to be modified. - * @param optimizationString A null-terminated string to represent the user - * specified optimization strategy. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_setOptimizationString( - NeuronCompilation* compilation, - const char* optimizationString); - -/** - * Only allow users' optimization string(from - * NeuronCompilation_setOptimizationString), the system won't set any compiler - * options for them. - * - * Available since 6.0.5 - * - * @param compilation The compilation to be modified. - * @param allow Allow only use user's setting or not. - * strategy. 
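/*
 * Editorial aside, not part of this diff: a minimal sketch of how the
 * compiled-network save/restore entry points removed in this hunk fit
 * together. It assumes the (removed) Neuron Adapter header and <stdlib.h>
 * are included, and that `model` was already built and finished with
 * NeuronModel_finish; only signatures documented in this header are used,
 * and error handling is reduced to early returns.
 */
static int compile_store_restore(NeuronModel* model) {
  NeuronCompilation* compilation = NULL;
  if (NeuronCompilation_create(model, &compilation) != NEURON_NO_ERROR ||
      NeuronCompilation_finish(compilation) != NEURON_NO_ERROR) {
    return -1;
  }

  /* Ask for the compiled network size, then serialize it into a user buffer. */
  size_t size = 0;
  if (NeuronCompilation_getCompiledNetworkSize(compilation, &size) != NEURON_NO_ERROR) {
    NeuronCompilation_free(compilation);
    return -1;
  }
  void* buffer = malloc(size);
  if (buffer == NULL ||
      NeuronCompilation_storeCompiledNetwork(compilation, buffer, size) != NEURON_NO_ERROR) {
    free(buffer);
    NeuronCompilation_free(compilation);
    return -1;
  }

  /* Later (possibly in another process) the stored bytes can be turned back
   * into a non-recompilable model/compilation pair. */
  NeuronModel* restored_model = NULL;
  NeuronCompilation* restored_compilation = NULL;
  int err = NeuronModel_restoreFromCompiledNetwork(
      &restored_model, &restored_compilation, buffer, size);
  /* An execution would normally be created from restored_compilation here. */

  free(buffer);
  NeuronCompilation_free(compilation);
  return err == NEURON_NO_ERROR ? 0 : -1;
}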
- * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_setOnlyAllowOptimizationString( - NeuronCompilation* compilation, - bool allow); - -/** - * Get the compiler hints which are used to apply the optimization strategy - * according to the user specified arguments in a null-terminated string. - * - * Available since 6.0.5 - * - * @param compilation The compilation to be modified. - * @param optimizationString A null-terminated string to represent the user - * specified optimization strategy. - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getOptimizationString( - NeuronCompilation* compilation, - const char** optimizationString); - -/** - * Hint compiler to trim the model IO alignment. - * - * Available since 4.4.8 - * - * @param compilation The compilation to be modified. - * @param enable 'true' for trimming model IO alignment. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_setTrimIOAlignment( - NeuronCompilation* compilation, - bool enable); - -/** - * Hint compiler to use software dilated convolution - * - * Available since 4.4.8 - * - * @param compilation The compilation to be modified. - * @param enable 'true' indicates a hint to compiler to use software dilated - * convolution - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_setSWDilatedConv( - NeuronCompilation* compilation, - bool enable); - -/** - * Create a new execution instance by calling the NeuronExecution_create - * function. The provided compilation must outlive the execution. - * - * Available since 4.1.0 - * - * @param compilation The NeuronCompilation to be evaluated. - * @param execution The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful - */ -int NeuronExecution_create( - NeuronCompilation* compilation, - NeuronExecution** execution); - -/** - * Destroy an execution. - * - * Available since 4.1.0 - * - * @param execution The execution to be destroyed. - */ -void NeuronExecution_free(NeuronExecution* execution); - -/** - * Associate a user buffer with an input of the model of the NeuronExecution. - * The provided buffer must outlive the execution. - * - * Available since 4.1.0 - * - * @param execution The execution to be modified. - * @param index The index of the input argument we are setting. It is an index - * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the - * index associated with NeuronModel_addOperand. - * @param type The NeuronOperandType of the operand. Currently NeuronAdapter - * only takes NULL. - * @param buffer The buffer containing the data. - * @param length The length in bytes of the buffer. - * - * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the name is not - * recognized or the buffer is too small for the input. - */ -int NeuronExecution_setInput( - NeuronExecution* execution, - int32_t index, - const NeuronOperandType* type, - const void* buffer, - size_t length); - -/** - * Associate a user buffer with an output of the model of the NeuronExecution. - * The provided buffer must outlive the execution. - * - * Available since 4.1.0 - * - * @param execution The execution to be modified. - * @param index The index of the output argument we are setting. It is an index - * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the - * index associated with NeuronModel_addOperand. - * @param type The NeuronOperandType of the operand. Currently NeuronAdapter - * only takes NULL. 
- * @param buffer The buffer where the data is to be written. - * @param length The length in bytes of the buffer. - * - * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the name is not - * recognized or the buffer is too small for the output. - */ -int NeuronExecution_setOutput( - NeuronExecution* execution, - int32_t index, - const NeuronOperandType* type, - void* buffer, - size_t length); - -/** - * Associate part of a memory object with an input of the model of the - * NeuronExecution. - * - * The provided memory must outlive the execution and should not be changed - * during computation. - * - * Available since 4.1.0 - * - * @param execution The execution to be modified. - * @param index The index of the input argument we are setting. It is an index - * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the - * index associated with Neuronodel_addOperand. - * @param type The NeuronOperandType of the operand. Currently NueronAdapter - * only takes NULL. - * @param memory The memory containing the data. - * @param offset This specifies the location of the data within the memory. The - * offset is in bytes from the start of memory. - * @param length The size in bytes of the data value. - * - * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the name is not - * recognized or the buffer is too small for the input. - */ -int NeuronExecution_setInputFromMemory( - NeuronExecution* execution, - uint32_t index, - const NeuronOperandType* type, - const NeuronMemory* memory, - size_t offset, - size_t length); - -/** - * Associate part of a memory object with an output of the model of the - * NeuronExecution. - * - * The provided memory must outlive the execution and should not be changed - * during computation. - * - * Available since 4.1.0 - * - * @param execution The execution to be modified. - * @param index The index of the output argument we are setting. It is an index - * into the lists passed to NeuronModel_identifyInputsAndOutputs. It is not the - * index associated with Neuronodel_addOperand. - * @param type The NeuronOperandType of the operand. Currently NueronAdapter - * only takes NULL. - * @param memory The memory containing the data. - * @param offset This specifies the location of the data within the memory. The - * offset is in bytes from the start of memory. - * @param length The size in bytes of the data value. - * - * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the name is not - * recognized or the buffer is too small for the input. - */ -int NeuronExecution_setOutputFromMemory( - NeuronExecution* execution, - uint32_t index, - const NeuronOperandType* type, - const NeuronMemory* memory, - size_t offset, - size_t length); - -/** - * Schedule synchronous evaluation of the execution. - * Returns once the execution has completed and the outputs are ready to be - * consumed. - * - * Available since 4.1.0 - * - * @param execution The execution to be scheduled and executed. - * - * @return NEURON_NO_ERROR if the execution completed normally. NEURON_BAD_STATE - * if the inference fails. Add two return code since 5.0.0 - * (NEURON_MISSED_DEADLINE_TRANSIENT if inference timeout, and - * NEURON_OUTPUT_INSUFFICIENT_SIZE if given outsize is not sufficient for real - * output) - * - */ -int NeuronExecution_compute(NeuronExecution* execution); - -/** - * Schedule asynchronous evaluation of the execution with dependencies. - * - * The execution will wait for all the depending events to be signaled before - * starting the evaluation. 
Once the execution has completed and the outputs - * are ready to be consumed, the returned event will be signaled. Depending on - * which devices are handling the execution, the event could be backed by a sync - * fence. Use NeuronEvent_wait to wait for that event. - * - * NeuronEvent_wait must be called to recurperate the resources used by the - * execution. - * - * If parts of the execution are scheduled on devices that do not support fenced - * execution, the function call may wait for such parts to finish before - * returning. - * - * The function will return an error if any of the events in dependencies is - * already in a bad state. After the execution is scheduled, if any of the - * events in dependencies does not complete normally, the execution will fail, - * and NeuronEvent_wait on the returned event will return an error. - * - * The function will return an error if any of the execution outputs has a - * tensor operand type that is not fully specified. - * - * @param execution The execution to be scheduled and executed. - * @param dependencies A set of depending events. The actual evaluation will not - * start until all the events are signaled. - * @param num_dependencies The number of events in the dependencies set. - * @param duration currently not used - * @param event The event that will be signaled on completion. event is set to - * NULL if there's an error. - * - * @return NEURON_NO_ERROR if the evaluation is successfully scheduled. - * - * Available since 5.0.0 - */ -int NeuronExecution_startComputeWithDependencies( - NeuronExecution* execution, - const NeuronEvent* const* dependencies, - uint32_t num_dependencies, - uint64_t duration, - NeuronEvent** event); - -/** - * Set the maximum duration of WHILE loops in the specified execution. - * - * @param execution The execution to be modified. - * @param duration The maximum amount of time in nanoseconds. - * @return NEURON_NO_ERROR if successful. - * - * Available since 5.0.0 - */ -int NeuronExecution_setLoopTimeout( - NeuronExecution* execution, - uint64_t duration); - -/** - * Get the default timeout value for WHILE loops. - * - * @return The default timeout value in nanoseconds. - * - * Available since 5.0.0 - */ -uint64_t Neuron_getDefaultLoopTimeout(); - -/** - * Get the maximum timeout value for WHILE loops. - * - * @return The maximum timeout value in nanoseconds. - * - * Available since 5.0.0 - */ -uint64_t Neuron_getMaximumLoopTimeout(); - -/** - * Sets the execution boost hint associated with this execution. Required before - * calling NeuronExecution_compute. - * - * Execution boost is the hint for the device frequency, ranged between 0 - * (lowest) to 100 (highest). For the compilation with preference set as - * NEURON_PREFER_SUSTAINED_SPEED, scheduler guarantees that the executing boost - * value would equal to the boost value hint. - * - * On the other hand, for the compilation with preference set as - * NEURON_PREFER_LOW_POWER, scheduler would try to save power by configuring the - * executing boost value with some value that is not higher than the boost value - * hint. - * - * Available since 4.1.0 - * - * @param execution The execution to be modified. - * @param boostValue The hint for the device frequency, ranged between 0 - * (lowest) to 100 (highest). - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronExecution_setBoostHint( - NeuronExecution* execution, - uint8_t boostValue); - -/** - * Sets the execution CPU cache flush hint associated with this execution. 
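/*
 * Editorial aside, not part of this diff: a minimal synchronous-inference
 * sketch using only the execution APIs documented in this header. It assumes
 * `compilation` was produced by NeuronCompilation_finish, that the model has
 * a single input and a single output, and that `in`/`out` are caller-owned
 * buffers of the expected (padded) sizes; per the comments above, the
 * operand-type argument is passed as NULL.
 */
static int run_once(NeuronCompilation* compilation,
                    const void* in, size_t in_len,
                    void* out, size_t out_len) {
  NeuronExecution* execution = NULL;
  if (NeuronExecution_create(compilation, &execution) != NEURON_NO_ERROR) {
    return -1;
  }

  int err = NeuronExecution_setInput(execution, 0, NULL, in, in_len);
  if (err == NEURON_NO_ERROR) {
    err = NeuronExecution_setOutput(execution, 0, NULL, out, out_len);
  }
  if (err == NEURON_NO_ERROR) {
    /* Boost hint is a 0-100 device-frequency hint and must be set before compute. */
    err = NeuronExecution_setBoostHint(execution, 100);
  }
  if (err == NEURON_NO_ERROR) {
    /* Blocks until the outputs are ready to be consumed. */
    err = NeuronExecution_compute(execution);
  }

  NeuronExecution_free(execution);
  return err == NEURON_NO_ERROR ? 0 : -1;
}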
- * Required before calling NeuronExecution_setInputFromMemory and - * NeuronExecution_setOutputFromMemory. - * - * Default value of preference is NEURON_CACHE_FLUSH_ENABLE_ALL - * - * Available since 5.0.1 - * - * @param execution The execution to be modified. - * @param hint It is either NEURON_CACHE_FLUSH_ENABLE_ALL or the bitwise OR - * of one or more of the following flags: NEURON_CACHE_FLUSH_DISABLE_SYNC_INPUT, - * NEURON_CACHE_FLUSH_DISABLE_INVALIDATE_OUTPUT. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronExecution_setCacheFlushHint( - NeuronExecution* execution, - uint8_t flushHint); - -/** - * Get the dimensional information of the specified output operand of the model - * of the latest computation evaluated on {@link NeuronExecution}. - * - * This function may only be invoked when the execution is in the completed - * state. - * - * Available since 5.0.0 - * - * @param execution The execution to be queried. - * @param index The index of the output argument we are querying. It is - * an index into the lists passed to {@link - * NeuronModel_identifyInputsAndOutputs}. - * @param rank The rank of the output operand. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronExecution_getOutputOperandRank( - NeuronExecution* execution, - int32_t index, - uint32_t* rank); - -/** - * Get the dimensional information of the specified output operand of the model - * of the latest computation evaluated on {@link NeuronExecution}. The target - * output operand cannot be a scalar. - * - * This function may only be invoked when the execution is in the completed - * state. - * - * Available since 5.0.0 - * - * @param execution The execution to be queried. - * @param index The index of the output argument we are querying. It is - * an index into the lists passed to {@link - * NeuronModel_identifyInputsAndOutputs}. - * @param dimensions The dimension array to be filled. The size of the array - * must be exactly as large as the rank of the output operand to be queried in - * the model. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronExecution_getOutputOperandDimensions( - NeuronExecution* execution, - int32_t index, - uint32_t* dimensions); - -/** - * Create a NeuronCompilation which can create executions with shared static - * memory. - * - * This function only creates the object. Compilation is only performed once - * NeuronCompilation_finish is invoked. NeuronCompilation_finish should be - * called once all desired properties have been set on the compilation. - * NeuronModel_free should be called once the compilation is no longer needed. - * The provided model must outlive the compilation. The model must already have - * been finished by a call to NeuronModel_finish. - * - * Available since 7.0.0 - * - * @param model The NeuronModel to be compiled. - * @param compilation The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful - */ -int NeuronCompilation_createForBatch( - NeuronModel* model, - NeuronCompilation** compilation); - -/** - * Set the size of runner pool, and create same number of runners. - * - * The execution must created by the following steps: - * NeuronCompilation_createForBatch, NeuronCompilation_finish, - * NeuronExecution_create. - * - * The execution created from this compilation has to use - * NeuronExecution_setRunnerPoolSize to create thread pool and then set a series - * of inputs & outputs into the execution. The execution will inference with the - * series of inputs. 
- * - * Available since 7.0.0 - * - * @param execution The NeuronExecution to be utilized. - * @param numRunners The number of runner need to be created. - * - * @return NEURON_NO_ERROR if successful - * @return NEURON_BAD_STATE if the compilation is not created via - * NeuronCompilation_createForBatch. - */ -int NeuronExecution_setRunnerPoolSize( - NeuronExecution* execution, - uint8_t numRunners); - -/** - * Notify the execution that all inputs / outputs have been set. - * Should be called after NeuronExecution_setInputFromMemory and - * NeuronExecution_setOutputFromMemory. - * - * The execution must created by the following steps: - * NeuronCompilation_createForBatch, NeuronCompilation_finish, - * NeuronExecution_create. - * - * Available since 7.0.0 - * - * @param execution The NeuronExecution to be utilized. - * - * @return NEURON_NO_ERROR if successful - * @return NEURON_BAD_STATE if the compilation is not created via - * NeuronCompilation_createForBatch. - */ -int NeuronExecution_setBatchDone(NeuronExecution* execution); - -/** - * Notify the execution that all inputs / outputs have been set. - * Should be called after NeuronExecution_setInputFromMemory and - * NeuronExecution_setOutputFromMemory. - * - * The execution must created by the following steps: - * 1. NeuronCompilation_createV2 with COMPILATION_TYPE_EXECUTION_CONTROLLER - * 2. NeuronCompilation_finish - * 3. NeuronExecution_create. - * or - * 1. NeuronModel_restoreFromCompiledNetworkV2 with - * COMPILATION_TYPE_EXECUTION_CONTROLLER - * 2. NeuronExecution_create. - * - * Available since 7.0.1 - * - * @param execution The NeuronExecution to be utilized. - * @param idx The index of runner to set the previous inputs and outputs. - * - * @return NEURON_NO_ERROR if successful - * @return NEURON_BAD_STATE if the compilation is not created via - * COMPILATION_TYPE_EXECUTION_CONTROLLER. - */ -int NeuronExecution_setIODone(NeuronExecution* execution, int idx); - -/** - * Create a NeuronCompilation which can create executions with shared static - * memory. - * - * This function only creates the object. Compilation is only performed once - * NeuronCompilation_finish is invoked. NeuronCompilation_finish should be - * called once all desired properties have been set on the compilation. - * NeuronModel_free should be called once the compilation is no longer needed. - * The provided model must outlive the compilation. The model must already have - * been finished by a call to NeuronModel_finish. - * - * The executions created from this compilation can be executed at the same - * time. - * - * Available since 7.0.0 - * - * @param model The NeuronModel to be compiled. - * @param compilation The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful - */ -int NeuronCompilation_createForMultiExecutions( - NeuronModel* model, - NeuronCompilation** compilation); - -/** - * Set report path for debug plus. - * - * Only be used in debug purpose, the execution should be created by - * NeuronCompilation_createForDebug compilation. - * - * Available since 5.0.0 - * - * @param model The model need to be debug. - * @param path The path of execution report. - * - * @return NEURON_NO_ERROR if successful, NEURON_BAD_DATA if the path is empty. - */ -int NeuronDebug_setReportPath(NeuronModel* model, const char* path); - -/** - * Get the number of available devices. - * - * Available since 4.1.0 - * @param numDevices The number of devices returned. - * - * @return NEURON_NO_ERROR if successful. 
- */ -int Neuron_getDeviceCount(uint32_t* numDevices); - -/** - * Get the representation of the specified device. - * - * Available since 4.1.0 - * - * @param devIndex The index of the specified device. Must be less than the - * number of available devices. - * @param device The representation of the specified device. The same - * representation will always be returned for the specified device. - * - * @return NEURONNO_ERROR if successful. - */ -int Neuron_getDevice(uint32_t devIndex, NeuronDevice** device); - -/** - * Get the name of the specified device. - * - * Available since 4.1.0 - * - * @param device The representation of the specified device. - * @param name The returned name of the specified device. The name will remain - * valid for the duration of the application. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronDevice_getName(const NeuronDevice* device, const char** name); - -/** - * Get the description of the specified device. - * - * Available since 5.0.0 - * - * @param device The representation of the specified device. - * @param description The returned description of the specified device. The - * description will remain valid for the duration of the application. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronDevice_getDescription( - const NeuronDevice* device, - const char** description); - -/* - * Destroys the event. - * - * See NeuronExecution for information on multithreaded usage. - * - * Available since 5.0.0 - * - * @param event The event object to be destroyed. Passing NULL is acceptable and - * results in no operation. - */ -void NeuronEvent_free(NeuronEvent* event); - -/* - * Force destroys the event without calling NeuronEvent_wait(). - * If user wants do wait before destroying the event, they should use - * NeuronEvent_free. - * - * See NeuronExecution for information on multithreaded usage. - * - * Available since 6.0.0 - * - * @param event The event object to be destroyed. Passing NULL is acceptable and - * results in no operation. - */ -void NeuronEvent_freeForce(NeuronEvent* event); - -/** - * Waits until the execution completes. - * - * More than one thread can wait on an event. When the execution completes, - * all threads will be released. - * - * SeeNeuronExecution for information on multithreaded usage. - * - * Available since 5.0.0 - * - * @param event The event that will be signaled on completion. - * @return NEURON_NO_ERROR if the execution completed normally. - * NEURON_UNMAPPABLE if the execution input or output memory cannot - * be properly mapped. - */ -int NeuronEvent_wait(NeuronEvent* event); - -/** - * Create a NeuronEventfrom a sync_fence file descriptor. - * - * The newly created NeuronEvent does not take ownership of the provided - * sync_fence_fd, it will instead dup the provided sync_fence_fd and own the - * duplicate. - * - * @param sync_fence_fd The sync_fence file descriptor. - * @param event The newly created object or NULL if unsuccessful. - * - * @return NEURON_NO_ERROR if successful. - * - * Available since 5.0.0 - */ -int NeuronEvent_createFromSyncFenceFd(int sync_fence_fd, NeuronEvent** event); - -/** - * Get sync_fence file descriptor from the event. - * - * If the NeuronEvent is not backed by a sync fence, the sync_fence_fd - * will be set to -1, and NEURON_BAD_DATA will be returned. - * - * See NeuronEvent_createFromSyncFenceFd and - * NeuronExecution_startComputeWithDependencies to see how to create an event - * backed by a sync fence. 
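/*
 * Editorial aside, not part of this diff: a sketch of the device-discovery
 * flow documented above, combined with NeuronCompilation_createForDevices.
 * It assumes the (removed) Neuron Adapter header plus <stdint.h>/<stdio.h>
 * are included, that `model` was already finished via NeuronModel_finish,
 * and it simply targets the first reported device; only signatures shown in
 * this header are used.
 */
static int compile_for_first_device(NeuronModel* model,
                                    NeuronCompilation** compilation) {
  uint32_t num_devices = 0;
  if (Neuron_getDeviceCount(&num_devices) != NEURON_NO_ERROR || num_devices == 0) {
    return -1;
  }

  /* List the available devices by name, then pick the first one. */
  NeuronDevice* chosen = NULL;
  for (uint32_t i = 0; i < num_devices; ++i) {
    NeuronDevice* device = NULL;
    const char* name = NULL;
    if (Neuron_getDevice(i, &device) == NEURON_NO_ERROR &&
        NeuronDevice_getName(device, &name) == NEURON_NO_ERROR) {
      printf("device %u: %s\n", i, name);
      if (chosen == NULL) {
        chosen = device;
      }
    }
  }
  if (chosen == NULL) {
    return -1;
  }

  /* Compile for an explicit device set; per the comment above, failures on
   * these devices are then the caller's responsibility. */
  const NeuronDevice* const devices[] = {chosen};
  return NeuronCompilation_createForDevices(model, devices, 1, compilation) ==
                 NEURON_NO_ERROR
             ? 0
             : -1;
}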
- * - * The user takes ownership of the returned fd, and must close the returned file - * descriptor when it is no longer needed. - * - * @param event An event that is backed by a sync fence. - * @param sync_fence_fd The sync_fence file descriptor. The file descriptor will - * be set to -1 if there is an error. - * - * @return NEURON_NO_ERROR if successful. - * - * Available since 5.0.0 - */ -int NeuronEvent_getSyncFenceFd(const NeuronEvent* event, int* sync_fence_fd); - -/** - * Queries whether an extension is supported by the driver implementation of the - * specified device. - * - * @param extension The extension name. - * @param isExtensionSupported The boolean value indicating whether the - * extension is supported. - * - * @return NEURON_NO_ERROR if successful. - * - * Available since 5.0.0 - */ -// Note: Remove "device" -int NeuronDevice_getExtensionSupport( - const char* extensionName, - bool* isExtensionSupported); - -/** - * Creates an operand type from an extension name and an extension operand code. - * - * See {@link NeuronModel} for information on multithreaded usage. - * - * Available since 5.0.0 - * - * @param model The model to contain the operand. - * @param extensionName The extension name. - * @param operandCodeWithinExtension The extension operand code. - * @param type The operand type. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_getExtensionOperandType( - NeuronModel* model, - const char* extensionName, - uint16_t operandCodeWithinExtension, - int32_t* type); - -/** - * Creates an operation type from an extension name and an extension operation - * code. - * - * See {@link NeuronModel} for information on multithreaded usage. - * - * Available since 5.0.0 - * - * @param model The model to contain the operation. - * @param extensionName The extension name. - * @param operationCodeWithinExtension The extension operation code. - * @param type The operation type. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_getExtensionOperationType( - NeuronModel* model, - const char* extensionName, - uint16_t operationCodeWithinExtension, - int32_t* type); - -/** - * Sets extension operand parameters. - * - * Available since 5.0.0 - * - * @param model The model to be modified. - * @param index The index of the model operand we're setting. - * @param data A pointer to the extension operand data. - * The data does not have to outlive the call to this function. - * @param length The size in bytes of the data value. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronModel_setOperandExtensionData( - NeuronModel* model, - int32_t index, - const void* data, - size_t length); - -/** - * Gets the execution preference associated with this compilation. - * This function must be called after calling NeuronCompilation_finish. - * - * Available since 6.0.0 - * - * @param compilation The compilation to be queried. - * @param preference The execution preference will be one of NEURON_PREFER_*. - * Ignore preference value if this function doesn't return NEURON_NO_ERROR. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getPreference( - NeuronCompilation* compilation, - int* preference); - -/** - * Gets the execution priority associated with this compilation. - * This function must be called after calling NeuronCompilation_finish. - * - * Available since 6.0.0 - * - * @param compilation The compilation to be queried. - * @param priority The priority will be one of NEURON_PRIORITY_*. 
Ignore - * priority value if this function doesn't return NEURON_NO_ERROR. - * - * @return NEURON_NO_ERROR if successful. - */ -int NeuronCompilation_getPriority( - NeuronCompilation* compilation, - int* priority); - -int NeuronCompilation_createWithOptions( - NeuronModel* model, - NeuronCompilation** compilation, - const char* options); -__END_DECLS diff --git a/backends/nxp/TARGETS b/backends/nxp/TARGETS index d56ac60242c..875f9813f43 100644 --- a/backends/nxp/TARGETS +++ b/backends/nxp/TARGETS @@ -50,7 +50,7 @@ runtime.python_library( name = "neutron_sdk", srcs = glob(["backend/**/*.py"]), deps = [ - "fbsource//third-party/pypi/neutron_converter:neutron_converter", + "fbsource//third-party/pypi/neutron_converter:neutron_converter", ], ) @@ -68,7 +68,6 @@ runtime.python_library( ":quantizer", "fbsource//third-party/pypi/flatbuffers:flatbuffers", "fbsource//third-party/pypi/ml-dtypes:ml-dtypes", - "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/serializer:serializer", "//executorch/exir:lib", "//executorch/backends/transforms:remove_getitem_op", "//caffe2:torch", diff --git a/backends/nxp/backend/ir/converter/builder/model_builder.py b/backends/nxp/backend/ir/converter/builder/model_builder.py index 1ca46237814..4f036854138 100755 --- a/backends/nxp/backend/ir/converter/builder/model_builder.py +++ b/backends/nxp/backend/ir/converter/builder/model_builder.py @@ -412,6 +412,26 @@ def _make_outputs_channels_first(self): self.get_sub_graph().outputs.tmp_outputs = new_outputs + def _keep_one_empty_buffer(self): + """Create a single empty `Buffer` object and assign it to all tensors in the model that don't have static data.""" + empty_buffer = self.get_first_empty_buffer() + + for t in self.get_tensors().vector: + if tensor_has_data(t): + # The buffer of `t` is not empty. + continue + + if t.tmp_buffer == empty_buffer: + # Already optimized. + continue + + if t.is_variable: + # The data of the tensor will change at runtime, so it shouldn't share the buffer with other tensors. + continue + + # It's safe to replace the buffer. + t.tmp_buffer = empty_buffer + def finish(self) -> tflite_model.Model: """Finalize and optimize the converted TFLite model. Then return it. @@ -430,6 +450,8 @@ def finish(self) -> tflite_model.Model: self.conversion_config.optimization_blacklist, ) + self._keep_one_empty_buffer() + # Remove outputs, which are not produced by any node. Otherwise, there would be errors after inference. operator_outputs = [] for op in self.get_operators().vector: diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizations/keep_one_empty_buffer.py b/backends/nxp/backend/ir/tflite_optimizer/optimizations/keep_one_empty_buffer.py deleted file mode 100755 index 9809719fad4..00000000000 --- a/backends/nxp/backend/ir/tflite_optimizer/optimizations/keep_one_empty_buffer.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2024 NXP -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from executorch.backends.nxp.backend.ir.converter.tensor_utils import tensor_has_data -from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.base_optimization import ( - BaseOptimization, -) - - -class KeepOneEmptyBuffer(BaseOptimization): - - def __call__(self) -> bool: - """Create a single empty `Buffer` object and assign it to all tensors in the model that don't have static data. - :return: True, if any tensors had their buffer changed. Otherwise, False. 
- """ - - made_changes = False - empty_buffer = self._builder.get_first_empty_buffer() - - for t in self._builder.get_tensors().vector: - if tensor_has_data(t): - # The buffer of `t` is not empty. - continue - - if t.tmp_buffer == empty_buffer: - # Already optimized. - continue - - if t.is_variable: - # The data of the tensor will change at runtime, so it shouldn't share the buffer with other tensors. - continue - - # It's safe to replace the buffer. - t.tmp_buffer = empty_buffer - made_changes = True - - return made_changes diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizer.py b/backends/nxp/backend/ir/tflite_optimizer/optimizer.py index eb4ce6a5992..d4a097ca76d 100755 --- a/backends/nxp/backend/ir/tflite_optimizer/optimizer.py +++ b/backends/nxp/backend/ir/tflite_optimizer/optimizer.py @@ -20,9 +20,6 @@ from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.fuse_fully_connected_and_add_operators import ( FuseFullyConnectedAndAddOperators, ) -from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.keep_one_empty_buffer import ( - KeepOneEmptyBuffer, -) from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.move_relu_before_concat import ( MoveActivationBeforeConcatenation, ) @@ -36,7 +33,6 @@ class Optimization(Enum): - KEEP_ONE_EMPTY_BUFFER = 0 FUSE_ACTIVATION_FUNCTIONS = 1 FUSE_FULLY_CONNECTED_AND_ADD = 2 @@ -76,9 +72,6 @@ def __init__( self._builder = builder self.optimization_map = { - Optimization.KEEP_ONE_EMPTY_BUFFER: KeepOneEmptyBuffer( - builder, conversion_config - ), Optimization.FUSE_ACTIVATION_FUNCTIONS: FuseActivationFunctions( builder, conversion_config ), diff --git a/backends/test/suite/operators/test_elu.py b/backends/test/suite/operators/test_elu.py index f768a426954..361e1382c37 100644 --- a/backends/test/suite/operators/test_elu.py +++ b/backends/test/suite/operators/test_elu.py @@ -7,6 +7,8 @@ # pyre-unsafe +import unittest + import torch from executorch.backends.test.suite.flow import TestFlow @@ -42,5 +44,6 @@ def test_elu_f32_multi_dim(self, flow: TestFlow) -> None: def test_elu_f32_alpha(self, flow: TestFlow) -> None: self._test_op(Model(alpha=0.5), (torch.randn(3, 4, 5),), flow) + @unittest.skip("In place activations aren't properly defunctionalized yet.") def test_elu_f32_inplace(self, flow: TestFlow) -> None: self._test_op(Model(inplace=True), (torch.randn(3, 4, 5),), flow) diff --git a/backends/test/suite/operators/test_hardsigmoid.py b/backends/test/suite/operators/test_hardsigmoid.py index 238b18b1e0d..8ca254d4f61 100644 --- a/backends/test/suite/operators/test_hardsigmoid.py +++ b/backends/test/suite/operators/test_hardsigmoid.py @@ -7,6 +7,8 @@ # pyre-unsafe +import unittest + import torch from executorch.backends.test.suite.flow import TestFlow @@ -38,6 +40,7 @@ def test_hardsigmoid_f32_single_dim(self, flow: TestFlow) -> None: def test_hardsigmoid_f32_multi_dim(self, flow: TestFlow) -> None: self._test_op(Model(), (torch.randn(2, 3, 4, 5),), flow) + @unittest.skip("In place activations aren't properly defunctionalized yet.") def test_hardsigmoid_f32_inplace(self, flow: TestFlow) -> None: self._test_op(Model(inplace=True), (torch.randn(3, 4, 5),), flow) diff --git a/backends/test/suite/operators/test_hardswish.py b/backends/test/suite/operators/test_hardswish.py index 66902791c33..a93516542c8 100644 --- a/backends/test/suite/operators/test_hardswish.py +++ b/backends/test/suite/operators/test_hardswish.py @@ -7,6 +7,8 @@ # pyre-unsafe +import unittest + import torch from 
executorch.backends.test.suite.flow import TestFlow @@ -38,6 +40,7 @@ def test_hardswish_f32_single_dim(self, flow: TestFlow) -> None: def test_hardswish_f32_multi_dim(self, flow: TestFlow) -> None: self._test_op(Model(), (torch.randn(2, 3, 4, 5),), flow) + @unittest.skip("In place activations aren't properly defunctionalized yet.") def test_hardswish_f32_inplace(self, flow: TestFlow) -> None: self._test_op(Model(inplace=True), (torch.randn(3, 4, 5),), flow) diff --git a/backends/test/suite/operators/test_hardtanh.py b/backends/test/suite/operators/test_hardtanh.py index 2fcd1dbf563..7520c3faeae 100644 --- a/backends/test/suite/operators/test_hardtanh.py +++ b/backends/test/suite/operators/test_hardtanh.py @@ -7,6 +7,8 @@ # pyre-unsafe +import unittest + import torch from executorch.backends.test.suite.flow import TestFlow @@ -45,6 +47,7 @@ def test_hardtanh_f32_multi_dim(self, flow: TestFlow) -> None: def test_hardtanh_f32_custom_range(self, flow: TestFlow) -> None: self._test_op(Model(min_val=-2.0, max_val=2.0), (torch.randn(3, 4, 5),), flow) + @unittest.skip("In place activations aren't properly defunctionalized yet.") def test_hardtanh_f32_inplace(self, flow: TestFlow) -> None: self._test_op(Model(inplace=True), (torch.randn(3, 4, 5),), flow) diff --git a/backends/test/suite/operators/test_leaky_relu.py b/backends/test/suite/operators/test_leaky_relu.py index 983da47bba3..79ed5425623 100644 --- a/backends/test/suite/operators/test_leaky_relu.py +++ b/backends/test/suite/operators/test_leaky_relu.py @@ -7,6 +7,8 @@ # pyre-unsafe +import unittest + import torch from executorch.backends.test.suite.flow import TestFlow @@ -44,6 +46,7 @@ def test_leaky_relu_f32_multi_dim(self, flow: TestFlow) -> None: def test_leaky_relu_f32_custom_slope(self, flow: TestFlow) -> None: self._test_op(Model(negative_slope=0.1), (torch.randn(3, 4, 5),), flow) + @unittest.skip("In place activations aren't properly defunctionalized yet.") def test_leaky_relu_f32_inplace(self, flow: TestFlow) -> None: self._test_op(Model(inplace=True), (torch.randn(3, 4, 5),), flow) diff --git a/backends/test/suite/operators/test_relu.py b/backends/test/suite/operators/test_relu.py index c9f416f090f..3c4ef2a98d0 100644 --- a/backends/test/suite/operators/test_relu.py +++ b/backends/test/suite/operators/test_relu.py @@ -7,6 +7,8 @@ # pyre-unsafe +import unittest + import torch from executorch.backends.test.suite.flow import TestFlow @@ -38,5 +40,6 @@ def test_relu_f32_single_dim(self, flow: TestFlow) -> None: def test_relu_f32_multi_dim(self, flow: TestFlow) -> None: self._test_op(Model(), (torch.randn(2, 3, 4, 5),), flow) + @unittest.skip("In place activations aren't properly defunctionalized yet.") def test_relu_f32_inplace(self, flow: TestFlow) -> None: self._test_op(Model(inplace=True), (torch.randn(3, 4, 5),), flow) diff --git a/backends/test/suite/operators/test_silu.py b/backends/test/suite/operators/test_silu.py index 69b6576734f..cf6d343f271 100644 --- a/backends/test/suite/operators/test_silu.py +++ b/backends/test/suite/operators/test_silu.py @@ -7,6 +7,8 @@ # pyre-unsafe +import unittest + import torch from executorch.backends.test.suite.flow import TestFlow @@ -38,6 +40,7 @@ def test_silu_f32_single_dim(self, flow: TestFlow) -> None: def test_silu_f32_multi_dim(self, flow: TestFlow) -> None: self._test_op(Model(), (torch.randn(2, 3, 4, 5),), flow) + @unittest.skip("In place activations aren't properly defunctionalized yet.") def test_silu_f32_inplace(self, flow: TestFlow) -> None: self._test_op(Model(inplace=True), 
(torch.randn(3, 4, 5),), flow) diff --git a/backends/test/suite/operators/test_threshold.py b/backends/test/suite/operators/test_threshold.py index 42b6fb801e5..3f69a9f41fe 100644 --- a/backends/test/suite/operators/test_threshold.py +++ b/backends/test/suite/operators/test_threshold.py @@ -7,6 +7,8 @@ # pyre-unsafe +import unittest + import torch from executorch.backends.test.suite.flow import TestFlow @@ -51,6 +53,7 @@ def test_threshold_f32_custom_value(self, flow: TestFlow) -> None: def test_threshold_f32_custom_threshold_value(self, flow: TestFlow) -> None: self._test_op(Model(threshold=0.5, value=1.0), (torch.randn(3, 4, 5),), flow) + @unittest.skip("In place activations aren't properly defunctionalized yet.") def test_threshold_f32_inplace(self, flow: TestFlow) -> None: self._test_op(Model(inplace=True), (torch.randn(3, 4, 5),), flow) diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index 200d8987b19..33bf84b9066 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -59,7 +59,7 @@ foreach(fbs_file ${_xnnpack_schema__srcs}) ) endforeach() -if(WIN32 AND NOT CMAKE_CROSSCOMPILING) +if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") set(MV_COMMAND powershell -Command "Move-Item -Path ${_xnnpack_flatbuffer__outputs} -Destination ${_xnnpack_schema__outputs} -Force" diff --git a/backends/xnnpack/test/ops/test_linear.py b/backends/xnnpack/test/ops/test_linear.py index ac6fec25732..dc92a9542a9 100644 --- a/backends/xnnpack/test/ops/test_linear.py +++ b/backends/xnnpack/test/ops/test_linear.py @@ -395,7 +395,9 @@ def _test_groupwise_dq_linear( quantize_( mod, Int8DynamicActivationIntxWeightConfig( - weight_dtype=torch.int4, weight_granularity=PerGroup(group_size) + # pyre-ignore[16] + weight_dtype=torch.int4, + weight_granularity=PerGroup(group_size), ), ) unwrap_tensor_subclass(mod) diff --git a/docs/source/backends-qualcomm.md b/docs/source/backends-qualcomm.md index f427c7c7cea..45f932da491 100644 --- a/docs/source/backends-qualcomm.md +++ b/docs/source/backends-qualcomm.md @@ -385,7 +385,7 @@ example_inputs = (torch.randn(1, 3, 224, 224),) # Example input tensor Choose between quantization approaches, post training quantization (PTQ) or quantization aware training (QAT): ```python from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer -from torch.ao.quantization.quantize_pt2e import prepare_pt2e, prepare_qat_pt2e, convert_pt2e +from torchao.quantization.pt2e.quantize_pt2e import prepare_pt2e, prepare_qat_pt2e, convert_pt2e quantizer = QnnQuantizer() m = torch.export.export(model, example_inputs, strict=True).module() diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index 8132751f6f0..106ab35363c 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -18,6 +18,7 @@ import torch from examples.devtools.scripts.export_bundled_program import save_bundled_program +from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec from executorch.backends.arm.ethosu import EthosUCompileSpec, EthosUPartitioner from executorch.backends.arm.quantizer import ( EthosUQuantizer, @@ -386,6 +387,7 @@ def get_compile_spec( memory_mode: Optional[str] = None, quantize: bool = False, config: Optional[str] = None, + debug_mode: Optional[str] = None, ) -> TosaCompileSpec | EthosUCompileSpec | VgfCompileSpec: compile_spec = None if target.startswith("TOSA"): @@ -414,6 +416,10 @@ def get_compile_spec( if intermediates is not None: 
compile_spec.dump_intermediate_artifacts_to(intermediates) + if debug_mode is not None: + mode = ArmCompileSpec.DebugMode[debug_mode.upper()] + compile_spec.dump_debug_info(mode) + return compile_spec @@ -601,6 +607,12 @@ def get_args(): action="store_true", help="Enable the QuantizedOpFusionPass fusion step", ) + parser.add_argument( + "--enable_debug_mode", + required=False, + choices=["json", "tosa"], + help="Flag to enable ATen-to-TOSA debug mode.", + ) args = parser.parse_args() if args.evaluate and ( @@ -735,6 +747,7 @@ def to_edge_TOSA_delegate( args.memory_mode, args.quantize, args.config, + args.enable_debug_mode, ) model_int8 = None @@ -776,6 +789,7 @@ def to_edge_no_delegate(exported_program, args, model: torch.nn.Module, example_ args.memory_mode, args.quantize, args.config, + args.enable_debug_mode, ) model, exported_program = quantize_model( args, model, example_inputs, compile_spec @@ -824,12 +838,21 @@ def transform_for_cortex_m_backend(edge, args): exported_program = torch.export.export( model, example_inputs, strict=args.strict_export ) + model = exported_program.module() model_fp32 = model + model_name = os.path.basename(os.path.splitext(args.model_name)[0]) if args.intermediates: os.makedirs(args.intermediates, exist_ok=True) + # We only support Python3.10 and above, so use a later pickle protocol + torch.export.save( + exported_program, + f"{args.intermediates}/{model_name}_exported_program.pt2", + pickle_protocol=5, + ) + # Quantize if required model_int8 = None if args.delegate: @@ -862,7 +885,6 @@ def transform_for_cortex_m_backend(edge, args): else: raise e - model_name = os.path.basename(os.path.splitext(args.model_name)[0]) output_name = f"{model_name}" + ( f"_arm_delegate_{args.target}" if args.delegate is True diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index ff6f73398c3..4e4a8eeb409 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -6,30 +6,59 @@ cmake_minimum_required(VERSION 3.20) project(arm_executor_runner) -option(SEMIHOSTING "Enable semihosting" OFF) -option( - ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE - "Set ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE to specify memory alloction pool size" - OFF -) option( ET_MODEL_PTE_ADDR "Place in memory that the PTE file is located/flashed, if set to OFF the PTE is built into the code as a big data area." 
OFF ) -option(ET_BUNDLE_IO "Set to compile in BundleIO support" OFF) -option(ET_ATOL "Set atol to use for BundleIO testing" OFF) -option(ET_RTOL "Set rtol to use for BundleIO testing" OFF) -option(ET_DUMP_INPUT "Dump input in log" OFF) -option(ET_DUMP_OUTPUT "Dump output in log" ON) -option(FETCH_ETHOS_U_CONTENT - "Fetch ethos_u dependencies instead of relying on pre-downloads" ON -) + set(ET_NUM_INFERENCES "1" CACHE STRING "Number of inferences to run" ) +option(ET_LOG_DUMP_INPUT "Dump input in log" OFF) +option(ET_LOG_DUMP_OUTPUT "Dump output in log" ON) + +option(ET_BUNDLE_IO "Set to compile in BundleIO support" OFF) +set(ET_ATOL + "0.01" + CACHE STRING "Set atol to use for BundleIO testing (Requires ET_BUNDLE_IO)" +) +set(ET_RTOL + "0.01" + CACHE STRING "Set rtol to use for BundleIO testing (Requires ET_BUNDLE_IO)" +) + +option( + ET_DUMP_OUTPUTS + "Collect and print outputs as a base64 buffer in the log (Requires EXECUTORCH_ENABLE_EVENT_TRACER)" + OFF +) +option( + ET_DUMP_INTERMEDIATE_OUTPUTS + "Collect and print intermediate outputs as a base64 buffer in the log (Requires EXECUTORCH_ENABLE_EVENT_TRACER)" + OFF +) +set(ET_DEBUG_BUFFER_SIZE + "2097152" + CACHE + STRING + "Size of buffer to collect intermediate outputs/outputs buffers (Requires EXECUTORCH_ENABLE_EVENT_TRACER and ET_DUMP_OUTPUTS or ET_DUMP_INTERMEDIATE_OUTPUTS)" +) + +option(SEMIHOSTING "Enable semihosting" OFF) + +option( + ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE + "Set ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE to specify memory allocation pool size" + OFF +) + +option(FETCH_ETHOS_U_CONTENT + "Fetch ethos_u dependencies instead of relying on pre-downloads" ON +) + if(NOT DEFINED ET_MODEL_PTE_ADDR AND NOT DEFINED ET_PTE_FILE_PATH AND NOT DEFINED SEMIHOSTING @@ -322,37 +351,29 @@ if(NOT ${ET_MODEL_PTE_ADDR} AND NOT SEMIHOSTING) add_dependencies(arm_executor_runner gen_model_header) endif() -if(SEMIHOSTING) - target_compile_definitions(arm_executor_runner PUBLIC SEMIHOSTING) -endif() - -if(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE) +if(ET_MODEL_PTE_ADDR) target_compile_definitions( - arm_executor_runner - PUBLIC - ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE} + arm_executor_runner PUBLIC -DET_MODEL_PTE_ADDR=${ET_MODEL_PTE_ADDR} ) endif() -target_compile_definitions( - arm_executor_runner - PUBLIC - ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE} -) -if(DEFINED ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE) +if(ET_NUM_INFERENCES) target_compile_definitions( - arm_executor_runner - PUBLIC - ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE} + arm_executor_runner PUBLIC ET_NUM_INFERENCES=${ET_NUM_INFERENCES} ) endif() -if(ET_MODEL_PTE_ADDR) - target_compile_definitions( - arm_executor_runner PUBLIC -DET_MODEL_PTE_ADDR=${ET_MODEL_PTE_ADDR} - ) +if(ET_LOG_DUMP_INPUT) + target_compile_definitions(arm_executor_runner PUBLIC -DET_LOG_DUMP_INPUT) +endif() + +if(ET_LOG_DUMP_OUTPUT) + target_compile_definitions(arm_executor_runner PUBLIC -DET_LOG_DUMP_OUTPUT) endif() +# Devtool BundleIO: Use Bundle PTE with input and reference output included to +# check if it matches.
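For context on how such a bundled PTE is produced on the host side, the sketch below uses the ExecuTorch devtools BundledProgram flow. It is a minimal illustration only: `eager_model` and `executorch_program` are placeholders for an already-exported model, and the exact module paths and argument names may differ between ExecuTorch versions.

```python
# Minimal host-side sketch (assumed API, see the ExecuTorch devtools docs):
# bundle a test case with expected outputs so a runner built with
# ET_BUNDLE_IO can compare results against ET_ATOL / ET_RTOL.
import torch
from executorch.devtools import BundledProgram
from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
from executorch.devtools.bundled_program.serialize import (
    serialize_from_bundled_program_to_flatbuffer,
)

sample_input = (torch.randn(1, 3, 224, 224),)  # placeholder input shape
suite = MethodTestSuite(
    method_name="forward",
    test_cases=[
        MethodTestCase(
            inputs=sample_input,
            expected_outputs=eager_model(*sample_input),  # reference output
        )
    ],
)
bundled = BundledProgram(executorch_program, [suite])
with open("model.bpte", "wb") as f:
    f.write(serialize_from_bundled_program_to_flatbuffer(bundled))
```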
+ if(ET_BUNDLE_IO) target_compile_definitions(arm_executor_runner PUBLIC -DET_BUNDLE_IO) endif() @@ -365,17 +386,50 @@ if(ET_RTOL) target_compile_definitions(arm_executor_runner PUBLIC ET_RTOL=${ET_RTOL}) endif() -if(ET_DUMP_INPUT) - target_compile_definitions(arm_executor_runner PUBLIC -DET_DUMP_INPUT) +# Devtools ETDump: Speed and dumping output + +if(ET_DUMP_OUTPUTS) + target_compile_definitions(arm_executor_runner PUBLIC -DET_DUMP_OUTPUTS) endif() -if(ET_DUMP_OUTPUT) - target_compile_definitions(arm_executor_runner PUBLIC -DET_DUMP_OUTPUT) +if(ET_DUMP_INTERMEDIATE_OUTPUTS) + target_compile_definitions( + arm_executor_runner PUBLIC -DET_DUMP_INTERMEDIATE_OUTPUTS + ) endif() -if(ET_NUM_INFERENCES) +if(ET_DEBUG_BUFFER_SIZE) target_compile_definitions( - arm_executor_runner PUBLIC ET_NUM_INFERENCES=${ET_NUM_INFERENCES} + arm_executor_runner PUBLIC ET_DEBUG_BUFFER_SIZE=${ET_DEBUG_BUFFER_SIZE} + ) +endif() + +# Semihosting FVP (FVP Simulator can access host filesystem) + +if(SEMIHOSTING) + target_compile_definitions(arm_executor_runner PUBLIC SEMIHOSTING) +endif() + +# Memory buffer sizes for Executorch flow + +if(ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE) + target_compile_definitions( + arm_executor_runner + PUBLIC + ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE} + ) +endif() + +target_compile_definitions( + arm_executor_runner + PUBLIC + ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE} +) +if(DEFINED ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE) + target_compile_definitions( + arm_executor_runner + PUBLIC + ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=${ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE} ) endif() diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp index d56710e27ad..696817450b5 100644 --- a/examples/arm/executor_runner/arm_executor_runner.cpp +++ b/examples/arm/executor_runner/arm_executor_runner.cpp @@ -6,10 +6,10 @@ * LICENSE file in the root directory of this source tree. */ -/* This is an example executorch runner running on Arm Cortex-m and Ethos-U +/* This is an example ExecuTorch runner running on Arm Cortex-M and Ethos-U * based hardware. This example tries to illustrate a few ways to use ExecuTorch * and you can use it as is or remove the unneeded parts. Please use this code - * as inpiration. + * as inspiration. * * Some defines used to configure the code: * @@ -20,24 +20,43 @@ * that is controlled by your memory mode via the * ETHOSU_MODEL cmake parameter. * If SEMIHOSTING is define this is not used - * ET_DUMP_INPUT - Control if you want input to be dumped to the log. - * ET_DUMP_OUTPUT - Control if you want output to be dumped to the log. - * ET_BUNDLE_IO - Build in devtools BundelIO, this makes it possible to + * ET_NUM_INFERENCES - Numbers of times to run the inference + * ET_LOG_DUMP_INPUT - Control if you want input to be dumped to the log. + * ET_LOG_DUMP_OUTPUT - Control if you want output to be dumped to the log. + * + * Devtool BundleIO: Use Bundle PTE with input and reference output included to + * check if it matches. + * + * ET_BUNDLE_IO - Build in Devtools BundleIO, this makes it possible to * use bpte with bundled input and output refdata to * compare output. 
* See also ET_ATOL and ET_RTOL - * ET_ATOL - The atol used to compare the output and ref data when - * using ET_BUNDLE_IO - * ET_RTOL - The rtol used to compare the output and ref data when - * using ET_BUNDLE_IO - * ET_EVENT_TRACER_ENABLED - Build in devtools event trace code to generate - * ETDump and print it base64 coded of it in the logs - * so you can get it out of your embedded target. - * This can be used to benchmark where time is spent. - * If you run on Ethos-U the delegate/commandstream - * is run in one go, this means that per op - * measurements is not possible. - * Warning: CPU time meassurements is NOT possible in the FVP simulator and a + * ET_ATOL - The atol used to compare the output and ref data + * when using ET_BUNDLE_IO + * ET_RTOL - The rtol used to compare the output and ref data + * when using ET_BUNDLE_IO + * + * Devtools ETDump: Speed and dumping output + * + * ET_EVENT_TRACER_ENABLED - Build in Devtools ETDump event trace code + * to generate cycle data and print it base64 + * coded in the log so you can get it out of + * your embedded target. This can be used to + * benchmark where time is spent. If you run + * on Ethos-U the delegate/commandstream is + * run in one go, this means that per op + * measurements are not possible. + * ET_DUMP_OUTPUTS - Collect and print outputs as a base64 buffer + * in the log, see ExecuTorch Devtools for more + * info. (Requires ET_EVENT_TRACER_ENABLED) + * ET_DUMP_INTERMEDIATE_OUTPUTS - Collect and print intermediate outputs as a + * base64 buffer in the log, see ExecuTorch + * Devtools for more info. + * (Requires ET_EVENT_TRACER_ENABLED) + * ET_DEBUG_BUFFER_SIZE - Override the size of memory area used by + * ET_DUMP_OUTPUTS or + * ET_DUMP_INTERMEDIATE_OUTPUTS + * + * Warning: CPU time measurements are NOT possible in the FVP simulator and a * real target or FPGA must be used. NPU number are roughly OK, and can be used * as guidance if timeing adaptor values are set correctly. * @@ -54,11 +73,12 @@ * left over memory after code is linked. This needs to be big enough to fit * and run your model. In our example using the FVP simulator we have much * memory and set this quite high to be able to test larger models. - * Regarding heap/mallocs type of allocation from executorch, + * Regarding heap/mallocs type of allocation from ExecuTorch, * et_pal_allocate() is not implemented or needed.
* - * ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE - Size of memory area - * used when setting up the model + * ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE - Size of memory area + * used when setting up + * the model * ET_ARM_BAREMETAL_FAST_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE - Size of memory area * used when running * inferences @@ -86,10 +106,21 @@ #if defined(ET_EVENT_TRACER_ENABLED) #include + +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) +#include + +#if !defined(ET_DEBUG_BUFFER_SIZE) +#define ET_DEBUG_BUFFER_SIZE (2 * 1024 * 1024) +#endif + +#endif + #if !defined(SEMIHOSTING) #include #endif -#endif + +#endif // defined(ET_EVENT_TRACER_ENABLED) #if defined(SEMIHOSTING) @@ -158,8 +189,10 @@ using executorch::bundled_program::ErrorStats; using executorch::bundled_program::verify_method_outputs; #endif #if defined(ET_EVENT_TRACER_ENABLED) +using executorch::etdump::BufferDataSink; using executorch::etdump::ETDumpGen; using executorch::etdump::ETDumpResult; +using executorch::runtime::EventTracerDebugLogLevel; using torch::executor::etdump_result; #endif /** @@ -505,6 +538,9 @@ struct RunnerContext { Box> method; #if defined(ET_EVENT_TRACER_ENABLED) Box etdump_gen; +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + void* debug_buffer; +#endif #endif #if defined(SEMIHOSTING) Box input_file_allocator; @@ -622,7 +658,60 @@ void runner_init( ET_LOG(Info, "Setting up ETDump"); ctx.etdump_gen.reset(); event_tracer_ptr = &ctx.etdump_gen.value(); -#endif + +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + // Alloc debug buffer and create if and only if we need to log intermediate + // tensor outputs + ctx.debug_buffer = ctx.method_allocator->allocate(ET_DEBUG_BUFFER_SIZE, 16); + if (ctx.debug_buffer != nullptr) { + Span debug_buffer_span( + (uint8_t*)ctx.debug_buffer, ET_DEBUG_BUFFER_SIZE); + + Result result = + ctx.etdump_gen.value().set_debug_buffer(debug_buffer_span); + + if (result.ok()) { + // Everything worked, we got the buffer setup, lets enable output logging + // depending on the compile flag ET_DUMP_INTERMEDIATE_OUTPUTS e.g. + // kIntermediateOutputs or kProgramOutputs +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) + ET_LOG( + Info, + "ETDump: Allocated intermediate output buffer size: %d at 0x%p", + ET_DEBUG_BUFFER_SIZE, + ctx.debug_buffer); + ctx.etdump_gen.value().set_event_tracer_debug_level( + EventTracerDebugLogLevel::kIntermediateOutputs); +#else // defined(ET_DUMP_INTERMEDIATE_OUTPUTS) + ET_LOG( + Info, + "ETDump: Allocated output buffer size: %d at 0x%p", + ET_DEBUG_BUFFER_SIZE, + ctx.debug_buffer); + ctx.etdump_gen.value().set_event_tracer_debug_level( + EventTracerDebugLogLevel::kProgramOutputs); +#endif // defined(ET_DUMP_INTERMEDIATE_OUTPUTS) + + } else { + // set_debug_buffer() failed + // Here we would free ctx.debug_buffer if it was possible, but we can't as + // the allocator don't support it. 
+ ctx.debug_buffer = nullptr; + ET_LOG( + Error, + "ETDump: Could not set_debug_buffer() for output buffer size %zu error:0x%" PRIx32, + ET_DEBUG_BUFFER_SIZE, + result.error()); + } + } else { + // debug buffer allocation failed + ET_LOG( + Error, + "ETDump: Could not allocate memory for output buffer size %zu", + ET_DEBUG_BUFFER_SIZE); + } +#endif // defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) +#endif // defined(ET_EVENT_TRACER_ENABLED) ctx.method.reset( program->load_method(ctx.method_name, &memory_manager, event_tracer_ptr)); @@ -660,7 +749,7 @@ void runner_init( ET_CHECK_MSG( status == Error::Ok, "Failed to prepare inputs 0x%" PRIx32, status); } -#if defined(ET_DUMP_INPUT) +#if defined(ET_LOG_DUMP_INPUT) { std::vector inputs((*ctx.method.value())->inputs_size()); ET_LOG(Info, "%zu inputs: ", inputs.size()); @@ -712,7 +801,7 @@ void runner_init( ET_LOG(Info, "Input prepared."); } -void log_mem_status(const RunnerContext& ctx) { +void log_mem_status(RunnerContext& ctx) { size_t executor_memsize = ctx.method_allocator->used_size() - ctx.executor_membase; @@ -765,6 +854,20 @@ void log_mem_status(const RunnerContext& ctx) { if (ctx.temp_allocator->size() > 0) { ET_LOG(Info, "temp_allocator: %zu", ctx.temp_allocator->size()); } +#if defined(ET_EVENT_TRACER_ENABLED) +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + if (ctx.debug_buffer != nullptr) { + size_t outputdump_len = ctx.etdump_gen->get_data_sink()->get_used_bytes(); + ET_LOG( + Info, + "ETDump_outputs_buffer: %zu / %zu free: %zu ( used: %zu %% ) ", + outputdump_len, + ET_DEBUG_BUFFER_SIZE, + ET_DEBUG_BUFFER_SIZE - outputdump_len, + 100 * outputdump_len / ET_DEBUG_BUFFER_SIZE); + } +#endif +#endif } void print_outputs(RunnerContext& ctx) { @@ -779,7 +882,7 @@ void print_outputs(RunnerContext& ctx) { if (outputs[i].isTensor()) { Tensor tensor = outputs[i].toTensor(); #if !defined(SEMIHOSTING) -#if defined(ET_DUMP_OUTPUT) +#if defined(ET_LOG_DUMP_OUTPUT) // The output might be collected and parsed so printf() is used instead // of ET_LOG() here for (int j = 0; j < tensor.numel(); ++j) { @@ -811,7 +914,7 @@ void print_outputs(RunnerContext& ctx) { } } #endif -#else +#else //! defined(SEMIHOSTING) char out_filename[255]; snprintf(out_filename, 255, "%s-%d.bin", ctx.output_basename, i); ET_LOG(Info, "Writing output to file: %s", out_filename); @@ -819,7 +922,7 @@ void print_outputs(RunnerContext& ctx) { auto written_size = fwrite(tensor.const_data_ptr(), 1, tensor.nbytes(), out_file); fclose(out_file); -#endif +#endif //! defined(SEMIHOSTING) } else { printf("Output[%d]: Not Tensor\n", i); } @@ -835,29 +938,96 @@ void write_etdump(RunnerContext& ctx) { if (result.buf != nullptr && result.size > 0) { // On a device with no file system we can't just write it out // to the file-system so we base64 encode it and dump it on the log. + bool dump_outputs = false; int mode = base64_enc_modifier_padding | base64_dec_modifier_skipspace; - size_t len = result.size; - size_t encoded_len = base64_encoded_size(result.size, mode); + size_t etdump_len = result.size; + size_t encoded_etdump_len = base64_encoded_size(etdump_len, mode); + size_t base64buffer_len = encoded_etdump_len; +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + // Make base64 buffer fit both so it can be reused istead of allocating two + // buffers. 
+ size_t outputdump_len = 0; + size_t encoded_outputdump_len = 0; + if (ctx.debug_buffer != nullptr) { + outputdump_len = ctx.etdump_gen->get_data_sink()->get_used_bytes(); + if (outputdump_len > 0) { + encoded_outputdump_len = base64_encoded_size(outputdump_len, mode); + if (encoded_outputdump_len > 0) { + base64buffer_len = + std::max(encoded_etdump_len, encoded_outputdump_len); + dump_outputs = true; + } else { + ET_LOG( + Error, + "Problem getting the size of the base64 ETDump output buffers"); + } + } else { + ET_LOG(Error, "No ETDump output buffers saved in the data area"); + } + } +#endif + ET_LOG(Info, "[base64] buffer size: %d", base64buffer_len); + uint8_t* encoded_buf = reinterpret_cast( - ctx.method_allocator->allocate(encoded_len + 1)); + ctx.method_allocator->allocate(base64buffer_len + 1)); if (encoded_buf != nullptr) { - int ret = base64_encode( - encoded_buf, (uint8_t*)result.buf, &encoded_len, &len, mode); - encoded_buf[encoded_len] = 0x00; // Ensure null termination - ET_LOG(Info, "Writing etdump.bin [base64]"); + int ret; + const char* debug_buffer_flag = ""; + printf("#[RUN THIS]\n"); +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + if (dump_outputs) { + ret = base64_encode( + encoded_buf, + (uint8_t*)ctx.debug_buffer, + &encoded_outputdump_len, + &outputdump_len, + mode); + encoded_buf[encoded_outputdump_len] = 0x00; // Ensure null termination + printf("# Writing debug_buffer.bin [base64]\n"); + printf("echo \"%s\" | base64 -d >debug_buffer.bin\n", encoded_buf); + debug_buffer_flag = "--debug_buffer_path debug_buffer.bin"; + } +#endif + ret = base64_encode( + encoded_buf, + (uint8_t*)result.buf, + &encoded_etdump_len, + &etdump_len, + mode); + encoded_buf[encoded_etdump_len] = 0x00; // Ensure null termination + printf("# Writing etdump.bin [base64]\n"); + printf("echo \"%s\" | base64 -d >etdump.bin\n", encoded_buf); + + printf("# Generate cpu cycle table with:\n"); printf( - "#[RUN THIS]\necho \"%s\" | base64 -d >etdump.bin\npython3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin --source_time_scale cycles --target_time_scale cycles\n#[END]\n", - encoded_buf); + "python3 -m devtools.inspector.inspector_cli --etdump_path etdump.bin %s --source_time_scale cycles --target_time_scale cycles\n", + debug_buffer_flag); + printf("#[END]\n"); + } else { ET_LOG( Error, "Could not allocate memory etdump base64 encoding size %zu", - encoded_len + 1); + encoded_etdump_len + 1); } } -#else - // Dump the etdump data containing profiling/debugging data to the specified - // file. +#else // !defined(SEMIHOSTING) +#if defined(ET_DUMP_INTERMEDIATE_OUTPUTS) || defined(ET_DUMP_OUTPUTS) + if (ctx.debug_buffer != nullptr) { + // Dump the etdump outputs data to a file. + size_t outputdump_len = ctx.etdump_gen->get_data_sink()->get_used_bytes(); + const char* etdump_output_filename = "debug_buffer.bin"; + ET_LOG( + Info, + "Writing etdump debug_buffer to file: %s", + etdump_output_filename); + FILE* f = fopen(etdump_output_filename, "w+"); + fwrite((uint8_t*)ctx.debug_buffer, 1, outputdump_len, f); + fclose(f); + } +#endif + + // Dump the etdump data containing profiling/debugging data to a file. 
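As a companion to the commands printed above, the host-side analysis can also be driven from Python. This is a sketch under the assumption that the devtools `Inspector` accepts the same inputs as `inspector_cli` (an `etdump_path` plus, when output dumping was enabled, a `debug_buffer_path`); parameter names may vary between releases.

```python
# Host-side sketch: after piping the printed base64 blobs through
# `base64 -d` to produce etdump.bin (and optionally debug_buffer.bin),
# load them with the devtools Inspector. Assumed to mirror inspector_cli.
from executorch.devtools import Inspector

inspector = Inspector(
    etdump_path="etdump.bin",
    debug_buffer_path="debug_buffer.bin",  # only if ET_DUMP_OUTPUTS was set
)
inspector.print_data_tabular()  # per-event timing (cycle counts on Ethos-U runs)
```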
etdump_result result = ctx.etdump_gen->get_etdump_data(); if (result.buf != nullptr && result.size > 0) { // On a device with a file system we can just write it out @@ -869,11 +1039,12 @@ void write_etdump(RunnerContext& ctx) { fclose(f); free(result.buf); } -#endif -#endif +#endif // !defined(SEMIHOSTING) +#endif // defined(ET_EVENT_TRACER_ENABLED) } -void verify_result(RunnerContext& ctx, const void* model_pte) { +bool verify_result(RunnerContext& ctx, const void* model_pte) { + bool model_ok = false; #if defined(ET_BUNDLE_IO) if (ctx.bundle_io) { // Check result @@ -899,6 +1070,7 @@ void verify_result(RunnerContext& ctx, const void* model_pte) { if (status == Error::Ok) { ET_LOG(Info, "Model output match expected BundleIO bpte ref data."); ET_LOG(Info, "TEST: BundleIO index[%d] Test_result: PASS", testset_idx); + model_ok = true; } else { ET_LOG( Error, @@ -906,19 +1078,24 @@ void verify_result(RunnerContext& ctx, const void* model_pte) { et_rtol, et_atol); ET_LOG(Error, "TEST: BundleIO index[%d] Test_result: FAIL", testset_idx); + ET_LOG( + Error, "Bundle verification failed with status 0x%" PRIx32, status); + model_ok = false; } - ET_CHECK_MSG( - status == Error::Ok, - "Bundle verification failed with status 0x%" PRIx32, - status); + } else { + // No checking done, assume true + model_ok = true; } -#else +#else // defined(ET_BUNDLE_IO) (void)ctx; (void)model_pte; -#endif + // No checking done, assume true + model_ok = true; +#endif // defined(ET_BUNDLE_IO) + return model_ok; } -void run_model(RunnerContext& ctx, const void* model_pte) { +bool run_model(RunnerContext& ctx, const void* model_pte) { Error status; ET_LOG(Info, "Starting running %d inferences...", num_inferences); int n = 0; @@ -946,7 +1123,10 @@ void run_model(RunnerContext& ctx, const void* model_pte) { ET_LOG(Info, "%d inferences finished", num_inferences); print_outputs(ctx); - verify_result(ctx, model_pte); + bool model_ok = verify_result(ctx, model_pte); + ET_LOG(Info, "Model run: %d", model_ok); + + return model_ok; } } // namespace @@ -1047,10 +1227,14 @@ int main(int argc, const char* argv[]) { model_pte[7]); runner_init(ctx, input_buffers, pte_size); - run_model(ctx, model_pte); + bool model_ok = run_model(ctx, model_pte); + ET_LOG(Info, "Model run: %d", model_ok); + log_mem_status(ctx); write_etdump(ctx); + ET_CHECK_MSG(model_ok == true, "Problem running model"); + ET_LOG(Info, "Program complete, exiting."); #if defined(SEMIHOSTING) _exit(0); diff --git a/examples/arm/run.sh b/examples/arm/run.sh index 77dddfe6451..8f5dec85ad4 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -41,6 +41,7 @@ scratch_dir_set=false toolchain=arm-none-eabi-gcc select_ops_list="aten::_softmax.out" qdq_fusion_op=false +model_explorer=false function help() { echo "Usage: $(basename $0) [options]" @@ -52,7 +53,7 @@ function help() { echo " --no_delegate Do not delegate the model (can't override builtin models)" echo " --no_quantize Do not quantize the model (can't override builtin models)" echo " --portable_kernels= TO BE DEPRECATED: Alias to select_ops_list." - echo " --select_ops_list= Comma separated list of portable (non delagated) kernels to include Default: ${select_ops_list}" + echo " --select_ops_list= Comma separated list of portable (non delegated) kernels to include Default: ${select_ops_list}" echo " NOTE: This is used when select_ops_model is not possible to use, e.g. for semihosting or bundleio." echo " See https://docs.pytorch.org/executorch/stable/kernel-library-selective-build.html for more information." 
echo " --target= Target to build and run for Default: ${target}" @@ -71,6 +72,7 @@ function help() { echo " --et_build_root= Executorch build output root folder to use, defaults to ${et_build_root}" echo " --scratch-dir= Path to your Ethos-U scrach dir if you not using default ${ethos_u_scratch_dir}" echo " --qdq_fusion_op Enable QDQ fusion op" + echo " --model_explorer Generate and open a visual graph of the compiled model." exit 0 } @@ -99,6 +101,7 @@ for arg in "$@"; do --et_build_root=*) et_build_root="${arg#*=}";; --scratch-dir=*) ethos_u_scratch_dir="${arg#*=}" ; scratch_dir_set=true ;; --qdq_fusion_op) qdq_fusion_op=true;; + --model_explorer) model_explorer=true ;; *) ;; esac @@ -289,6 +292,12 @@ for i in "${!test_model[@]}"; do pte_file=$(realpath ${pte_file}) + if [ "${etrecord_flag}" != "" ] ; then + etrecord_filename="${output_folder}/${model_filename}_etrecord.bin" + etrecord_filename=$(realpath ${etrecord_filename}) + etrecord_flag="--etrecord=${etrecord_filename}" + fi + [[ -f ${pte_file} ]] || { >&2 echo "Failed to generate a pte file - ${pte_file}"; exit 1; } echo "pte_data_size: $(wc -c ${pte_file})" echo "pte_file: ${pte_file}" @@ -322,10 +331,16 @@ for i in "${!test_model[@]}"; do backends/arm/scripts/build_executor_runner.sh --et_build_root="${et_build_root}" --pte="${pte_file_or_mem}" --build_type=${build_type} --target=${target} --system_config=${system_config} --memory_mode=${memory_mode} ${bundleio_flag} ${et_dump_flag} --extra_build_flags="${extra_build_flags}" --ethosu_tools_dir="${ethos_u_scratch_dir}" --toolchain="${toolchain}" --select_ops_list="${select_ops_list}" if [ "$build_only" = false ] ; then # Execute the executor_runner on FVP Simulator - backends/arm/scripts/run_fvp.sh --elf=${elf_file} ${model_data} --target=$target + + backends/arm/scripts/run_fvp.sh --elf=${elf_file} ${model_data} --target=$target ${etrecord_flag} fi set +x fi + + if [ "$model_explorer" = true ]; then + tosa_flatbuffer_path=$(find ${output_folder} -name "*TOSA*.tosa" | head -n 1) + python3 ${script_dir}/visualize.py ${tosa_flatbuffer_path} + fi done exit 0 diff --git a/examples/arm/visualize.py b/examples/arm/visualize.py new file mode 100644 index 00000000000..51fca5b3895 --- /dev/null +++ b/examples/arm/visualize.py @@ -0,0 +1,32 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse + +import model_explorer + +from executorch.devtools.visualization.visualization_utils import ( + visualize_model_explorer, +) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Visualize a model using model explorer." + ) + parser.add_argument("model_path", type=str, help="Path to the model file.") + args = parser.parse_args() + + config = model_explorer.config() + (config.add_model_from_path(args.model_path)) + + visualize_model_explorer( + config=config, + extensions=["tosa_adapter_model_explorer"], + ) + + +if __name__ == "__main__": + main() diff --git a/examples/mediatek/README.md b/examples/mediatek/README.md index c03ab0ec48c..ffc91e10e60 100755 --- a/examples/mediatek/README.md +++ b/examples/mediatek/README.md @@ -26,44 +26,61 @@ examples/mediatek # Examples Build Instructions ## Environment Setup -- Follow the instructions of **Prerequisites** and **Setup** in `backends/mediatek/scripts/README.md`. +- Follow the instructions in `backends/mediatek/README.md` to build the backend library `libneuron_backend.so`. 
-- Build required libraries by `backends/mediatek/scripts/mtk_build.sh` before building examples. - -## Build MediaTek Examples -1. Build the backend and the examples by exedcuting the script: +## Build MediaTek Runners +1. Build the mediatek model runner by executing the script: ```bash ./mtk_build_examples.sh ``` +This will generate the required runners in `executorch/cmake-android-out/examples/mediatek/` -## LLaMa Example Instructions +## Model Export Instructions ##### Note: Verify that localhost connection is available before running AoT Flow -1. Exporting Models to `.pte` -- In the `examples/mediatek directory`, run: +1. Download Required Files +- Download the model files from the official Hugging Face website, and move the files to the respective folder in `examples/mediatek/models/llm_models/weights/` **EXCEPT** the `config.json` file. + - The `config.json` file is already included in the model folders, which may include some modifications required for the model exportation. +- Include the calibration data (if any) under `aot_utils/llm_utils/prompts/` + +2. Exporting Models to `.pte` +- In the `examples/mediatek/ directory`, run: ```bash -source shell_scripts/export_llama.sh +source shell_scripts/export_.sh ``` - Defaults: - - `model_name` = llama3 + - `model_name` = Depends on model family. Check respective `shell_scripts/export_.sh` for info. - `num_chunks` = 4 - `prompt_num_tokens` = 128 - - `cache_size` = 1024 - - `calibration_set_name` = None + - `cache_size` = 512 + - `calibration_data_file` = None + - `precision` = A16W4 + - `platform` = DX4 + - Argument Explanations/Options: - - `model_name`: llama2/llama3 - **Note: Currently Only Tested on Llama2 7B Chat and Llama3 8B Instruct.** - - `num_chunks`: Number of chunks to split the model into. Each chunk contains the same number of decoder layers. Will result in `num_chunks` number of `.pte` files being generated. Typical values are 1, 2 and 4. + - `model_name`: View list 'Available model names' below. + - `num_chunks`: Number of chunks to split the model into. Each chunk contains the same number of decoder layers. Typical values are 1, 2 and 4. - `prompt_num_tokens`: Number of tokens (> 1) consumed each forward pass for the prompt processing stage. - `cache_size`: Cache Size. - - `calibration_set_name`: Name of calibration dataset with extension that is found inside the `aot_utils/llm_utils/prompts` directory. Example: `alpaca.txt`. If `"None"`, will use dummy data to calibrate. + - `calibration_data_file`: Name of calibration dataset with extension that is found inside the `aot_utils/llm_utils/prompts/` directory. Example: `alpaca.txt`. If `"None"`, will use dummy data to calibrate. + - `precision`: Quantization precision for the model. Available options are `["A16W4", "A16W8", "A16W16", "A8W4", "A8W8"]` + - `platform`: The platform of the device. `DX4` for Mediatek Dimensity 9400 and `DX3` for Mediatek Dimensity 9300. **Note: Export script example only tested on `.txt` file.** -2. `.pte` files will be generated in `examples/mediatek/pte` - - Users should expect `num_chunks*2` number of pte files (half of them for prompt and half of them for generation). - - Generation `.pte` files have "`1t`" in their names. - - Additionally, an embedding bin file will be generated in the weights folder where the `config.json` can be found in. 
[`examples/mediatek/models/llm_models/weights//embedding__fp32.bin`] +- Available model names: + - Llama: + - llama3.2-3b, llama3.2-1b, llama3, llama2 + - Qwen: + - Qwen3-4B, Qwen3-1.7B, Qwen2-7B-Instruct, Qwen2.5-3B, Qwen2.5-0.5B-Instruct, Qwen2-1.5B-Instruct + - Gemma: + - gemma2, gemma3 + - Phi: + - phi3.5, phi4 + +3. `.pte` files will be generated in `examples/mediatek/pte/` + - Users should expect `num_chunks` number of pte files. + - An embedding bin file will be generated in the weights folder where the `config.json` can be found in. [`examples/mediatek/models/llm_models/weights//embedding__fp32.bin`] - eg. For `llama3-8B-instruct`, embedding bin generated in `examples/mediatek/models/llm_models/weights/llama3-8B-instruct/` - - AoT flow will take roughly 2.5 hours (114GB RAM for `num_chunks=4`) to complete (Results will vary by device/hardware configurations) + - AoT flow will take around 30 minutes to 2.5 hours to complete (Results will vary depending on device/hardware configurations and model sizes) ### oss 1. Exporting Model to `.pte` @@ -74,26 +91,31 @@ bash shell_scripts/export_oss.sh - `model_name`: deeplabv3/edsr/inceptionv3/inceptionv4/mobilenetv2/mobilenetv3/resnet18/resnet50/dcgan/wav2letter/vit_b_16/mobilebert/emformer_rnnt/bert/distilbert # Runtime -## Environment Setup - -To set up the build environment for the `mtk_executor_runner`: - -1. Navigate to the `backends/mediatek/scripts` directory within the repository. -2. Follow the detailed build steps provided in that location. -3. Upon successful completion of the build steps, the `mtk_executor_runner` binary will be generated. - ## Deploying and Running on the Device ### Pushing Files to the Device -Transfer the `.pte` model files and the `mtk_executor_runner` binary to your Android device using the following commands: +Transfer the directory containing the `.pte` model files, the `run__sample.sh` script, the `embedding__fp32.bin`, the tokenizer file, the `mtk_llama_executor_runner` binary and the 3 `.so` files to your Android device using the following commands: ```bash -adb push mtk_executor_runner -adb push .pte +adb push mtk_llama_executor_runner +adb push examples/mediatek/executor_runner/run__sample.sh +adb push embedding__fp32.bin +adb push tokenizer.model +adb push ``` -Make sure to replace `` with the actual name of your model file. And, replace the `` with the desired detination on the device. +Make sure to replace `` with the actual name of your directory containing pte files. And, replace the `` with the desired detination on the device. + +At this point your phone directory should have the following files: +- libneuron_backend.so +- libneuronusdk_adapter.mtk.so +- libneuron_buffer_allocator.so +- mtk_llama_executor_runner +- +- tokenizer.json / tokenizer.model(for llama3) / tokenizer.bin(for phi3 and gemma2) +- embedding__fp32.bin +- run__sample.sh ##### Note: For oss models, please push additional files to your Android device ```bash @@ -107,12 +129,13 @@ for i in input*bin; do adb push "$i" ; done; Execute the model on your Android device by running: ```bash -adb shell "/data/local/tmp/mtk_executor_runner --model_path /data/local/tmp/.pte --iteration " +adb shell +cd +sh run__sample.sh ``` +#### Note: The `mtk_llama_executor_runner` is applicable to the models listed in `examples/mediatek/models/llm_models/weights/`. -In the command above, replace `` with the name of your model file and `` with the desired number of iterations to run the model. 
- -##### Note: For llama models, please use `mtk_llama_executor_runner`. Refer to `examples/mediatek/executor_runner/run_llama3_sample.sh` for reference. +##### Note: For non-LLM models, please run `adb shell "/data/local/tmp/mtk_executor_runner --model_path /data/local/tmp/.pte --iteration "`. ##### Note: For oss models, please use `mtk_oss_executor_runner`. ```bash adb shell "/data/local/tmp/mtk_oss_executor_runner --model_path /data/local/tmp/.pte --input_list /data/local/tmp/input_list.txt --output_folder /data/local/tmp/output_" diff --git a/examples/mediatek/executor_runner/mtk_llama_runner.h b/examples/mediatek/executor_runner/mtk_llama_runner.h index 5dd8a85005e..e82b36d9373 100644 --- a/examples/mediatek/executor_runner/mtk_llama_runner.h +++ b/examples/mediatek/executor_runner/mtk_llama_runner.h @@ -66,6 +66,8 @@ class MTKLlamaRunner : public executorch::extension::llm::IRunner { std::function token_callback); std::unique_ptr load_tokenizer(); + void reset() {} + private: // model const LlamaModelOptions modeloptions_; diff --git a/examples/mediatek/executor_runner/run_phi4_sample.sh b/examples/mediatek/executor_runner/run_phi4_sample.sh index a6d9824e178..16c4f70009c 100644 --- a/examples/mediatek/executor_runner/run_phi4_sample.sh +++ b/examples/mediatek/executor_runner/run_phi4_sample.sh @@ -49,7 +49,7 @@ chmod +x mtk_llama_executor_runner export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD -./mtk_llama_executor_runner_longrope \ +./mtk_llama_executor_runner \ --max_response=$MAX_RESPONSE \ --prompt_token_batch_size=$PROMPT_TOKEN_BATCH_SIZE \ --cache_size=$CACHE_SIZE \ diff --git a/examples/models/llama/main.cpp b/examples/models/llama/main.cpp index 25b840f260b..078d938ffde 100644 --- a/examples/models/llama/main.cpp +++ b/examples/models/llama/main.cpp @@ -105,6 +105,8 @@ int32_t main(int32_t argc, char** argv) { ET_LOG(Error, "Failed to warmup llama runner"); return 1; } + // reset kv cache pos to 0 + runner->reset(); } // generate executorch::extension::llm::GenerationConfig config{ diff --git a/examples/models/llama/source_transformation/quantize.py b/examples/models/llama/source_transformation/quantize.py index 835972b7f3e..8b76b7650fe 100644 --- a/examples/models/llama/source_transformation/quantize.py +++ b/examples/models/llama/source_transformation/quantize.py @@ -135,6 +135,7 @@ def quantize( # noqa C901 PerAxis(0) if group_size == 0 else PerGroup(group_size) ), weight_mapping_type=MappingType.SYMMETRIC, + # pyre-ignore[6] intx_packing_format="opaque_torchao_auto", ), ) @@ -154,12 +155,23 @@ def quantize( # noqa C901 from torchao.quantization.granularity import PerGroup from torchao.utils import unwrap_tensor_subclass + def filter_fn(m, fqn): + is_linear = isinstance(m, nn.Linear) + has_shape_compatible_with_group_size = False + if is_linear: + has_shape_compatible_with_group_size = ( + m.weight.shape[1] % group_size == 0 + ) + return is_linear and has_shape_compatible_with_group_size + quantize_( model, Int8DynamicActivationIntxWeightConfig( + # pyre-ignore[16] weight_dtype=torch.int4, weight_granularity=PerGroup(group_size), ), + filter_fn=filter_fn, ) model = unwrap_tensor_subclass(model) diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index f903e0f2ecf..7e571087c1d 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -77,7 +77,7 @@ def __init__(self, llava): super().__init__() self.text_model = llava.text_model - def forward(self, input_pos, embeddings): + def forward(self, 
embeddings, input_pos): return self.text_model(None, {"input_pos": input_pos}, embeddings) llava_text_model = LlavaTextModel(llava) @@ -88,7 +88,7 @@ def forward(self, input_pos, embeddings): max_seq_len=llava.text_model_args.max_seq_len, dtype=DType.fp32, use_kv_cache=True, - example_inputs=(torch.tensor([0], dtype=torch.int64), embeddings), + example_inputs=(embeddings, torch.tensor([0], dtype=torch.int64)), dynamic_shapes=dynamic_shapes, ) diff --git a/examples/models/llava/model.py b/examples/models/llava/model.py index 3973d756e9c..9ff56124174 100644 --- a/examples/models/llava/model.py +++ b/examples/models/llava/model.py @@ -405,5 +405,5 @@ def _get_image_dynamic_shapes(self): def _get_prompt_dynamic_shapes(self): dim = torch.export.Dim("token_dim", min=2, max=self.max_seq_len) - text_model_dynamic_shapes = ({0: 1}, {1: dim}) + text_model_dynamic_shapes = ({1: dim}, {0: 1}) return text_model_dynamic_shapes diff --git a/examples/models/llava/runner/llava_image_prefiller.h b/examples/models/llava/runner/llava_image_prefiller.h index 9edfab85904..f5f316d0cac 100644 --- a/examples/models/llava/runner/llava_image_prefiller.h +++ b/examples/models/llava/runner/llava_image_prefiller.h @@ -47,7 +47,7 @@ class ET_EXPERIMENTAL LlavaImagePrefiller { // Run text model auto outputs_res = ET_UNWRAP(module_->execute( - kTextModelMethod, {start_pos_tensor, image_encoder_outputs[0]})); + kTextModelMethod, {image_encoder_outputs[0], start_pos_tensor})); ET_CHECK_MSG( outputs_res[0].isTensor(), "Non Tensor Output returned from executing image prefill"); diff --git a/examples/models/llava/runner/llava_text_decoder_runner.h b/examples/models/llava/runner/llava_text_decoder_runner.h index cfa92e0c253..691e2f4aa1e 100644 --- a/examples/models/llava/runner/llava_text_decoder_runner.h +++ b/examples/models/llava/runner/llava_text_decoder_runner.h @@ -34,7 +34,7 @@ class ET_EXPERIMENTAL LlavaTextDecoderRunner &start_pos, {1}, executorch::aten::ScalarType::Long); // run text model auto outputs_res = ET_UNWRAP(module_->execute( - kTextModelMethod, {start_pos_tensor, token_embedding_outputs[0]})); + kTextModelMethod, {token_embedding_outputs[0], start_pos_tensor})); ET_CHECK_MSG( outputs_res.size() == 1, diff --git a/examples/models/llava/test/test_llava.py b/examples/models/llava/test/test_llava.py index def9eaa02bd..7f2b59e0116 100644 --- a/examples/models/llava/test/test_llava.py +++ b/examples/models/llava/test/test_llava.py @@ -97,7 +97,7 @@ def test_llava_export(self): )[0] llava_module.run_method( "text_decoder", - (torch.tensor([start_pos], dtype=torch.int64), pte_embeds_before_img), + (pte_embeds_before_img, torch.tensor([start_pos], dtype=torch.int64)), ) # Update the start_pos. start_pos is used in kv cache. The source of truth @@ -109,8 +109,8 @@ def test_llava_export(self): llava_module.run_method( "text_decoder", ( - torch.tensor([start_pos], dtype=torch.int64), pte_embeds_img, + torch.tensor([start_pos], dtype=torch.int64), ), ) @@ -123,7 +123,7 @@ def test_llava_export(self): )[0] pte_prefill_after_img = llava_module.run_method( "text_decoder", - (torch.tensor([start_pos], dtype=torch.int64), pte_embeds_after_img), + (pte_embeds_after_img, torch.tensor([start_pos], dtype=torch.int64)), )[0] # Update the logits for each prefill (kv cache) step. 
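The hunks above and below all make the same change: the exported `text_decoder` method now takes the token embeddings first and the KV-cache position second. A condensed view of the new calling convention, using the objects from this test as placeholders:

```python
# New argument order for the "text_decoder" method: (embeddings, input_pos).
# `llava_module`, `embeds` and `start_pos` stand in for the objects used in
# this test; previously the position tensor was passed first.
import torch

logits = llava_module.run_method(
    "text_decoder",
    (embeds, torch.tensor([start_pos], dtype=torch.int64)),
)[0]
```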
@@ -140,7 +140,7 @@ def test_llava_export(self): )[0] logits = llava_module.run_method( "text_decoder", - (torch.tensor([start_pos + i], dtype=torch.int64), token_embeds), + (token_embeds, torch.tensor([start_pos + i], dtype=torch.int64)), )[0] new_tokens.append(torch.argmax(logits).item()) diff --git a/examples/models/voxtral/README.md b/examples/models/voxtral/README.md index 5bc675e0615..5f4eeb2ff95 100644 --- a/examples/models/voxtral/README.md +++ b/examples/models/voxtral/README.md @@ -54,7 +54,8 @@ The exported model takes in a mel spectrogram input tensor as its audio inputs. We provide a simple way to transform raw audio data into a mel spectrogram by exporting a version of Voxtral's audio preprocessor used directly by Transformers. ``` -python -m executorch.extension.audio.mel_spectrogram --feature_size 128 --output_file voxtral_preprocessor.pte +# Export a preprocessor that can handle audio up to 5 mins (300s). +python -m executorch.extension.audio.mel_spectrogram --feature_size 128 --stack_output --max_audio_len 300 --output_file voxtral_preprocessor.pte ``` ## Building the multimodal runner diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index 26e70c90f38..47f9f0cfb38 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -568,6 +568,8 @@ int main(int argc, char** argv) { ET_LOG( Info, "Input list not provided. Inputs prepared with default values set."); + + // Run the method Error status = method->execute(); ET_CHECK_MSG( status == Error::Ok, @@ -575,6 +577,31 @@ int main(int argc, char** argv) { method_name, (int)status); ET_LOG(Info, "Model executed successfully."); + + // Warm up + ET_LOG(Info, "Perform %d inferences for warming up", FLAGS_warm_up); + for (int i = 0; i < FLAGS_warm_up; ++i) { + status = method->execute(); + } + + // Inference with designated iterations + auto before_exec = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < FLAGS_iteration; ++i) { + status = method->execute(); + } + auto after_exec = std::chrono::high_resolution_clock::now(); + double interval_infs = + std::chrono::duration_cast( + after_exec - before_exec) + .count() / + 1000.0; + + ET_LOG( + Info, + "%d inferences took %f ms, avg %f ms", + FLAGS_iteration, + interval_infs, + interval_infs / (float)FLAGS_iteration); } // Dump the etdump data containing profiling/debugging data to the specified diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp index 0c9be4d441d..253e083a80e 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -354,17 +354,6 @@ Error Runner::generate( const llm::GenerationConfig& config, std::function token_callback, std::function stats_callback) { - return generate_from_pos(prompt, 0, config, token_callback, stats_callback); -} - -template -Error Runner::generate_from_pos( - const std::string& prompt, - int64_t start_pos, - const llm::GenerationConfig& config, - std::function token_callback, - std::function stats_callback) { - // TODO: currently only support start_pos == 0 return generate_from_prompt_or_file( prompt, false, config, token_callback, stats_callback); } @@ -435,7 +424,8 @@ Error Runner::generate_from_prompt_or_file( stats_.first_token_ms = time_in_ms(); stats_.prompt_eval_end_ms = time_in_ms(); - // print the first token from prefill. 
No prev_token so use cur_token for it. + // print the first token from prefill. No prev_token so use cur_token for + // it. if (token_callback) { token_callback( ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token))); diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h index 30fba71ecef..9f290d79c75 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.h +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h @@ -72,13 +72,7 @@ class Runner : public executorch::extension::llm::IRunner { std::function token_callback = {}, std::function stats_callback = {}) override; - executorch::runtime::Error generate_from_pos( - const std::string& prompt, - int64_t start_pos, - const executorch::extension::llm::GenerationConfig& config, - std::function token_callback = {}, - std::function stats_callback = {}) - override; + executorch::runtime::Error generate_from_prompt_or_file( const std::string& prompt, bool tokenized_prompt, @@ -86,6 +80,7 @@ class Runner : public executorch::extension::llm::IRunner { std::function token_callback = {}, std::function stats_callback = {}); void stop() override {}; + void reset() override {}; executorch::runtime::Result get_decoder_model_version(); private: diff --git a/export/target_recipes.py b/export/target_recipes.py index 76e0cacc7b4..0a5ae9ce754 100644 --- a/export/target_recipes.py +++ b/export/target_recipes.py @@ -11,26 +11,32 @@ selection and combine multiple backends optimally for target hardware. """ +import sys from typing import Dict, List -import coremltools as ct +if sys.platform != "win32": + import coremltools as ct + from executorch.backends.apple.coreml.recipes import CoreMLRecipeType # pyre-ignore -from executorch.backends.apple.coreml.recipes import CoreMLRecipeType from executorch.backends.xnnpack.recipes import XNNPackRecipeType from executorch.export.recipe import ExportRecipe, RecipeType ## IOS Target configs # The following list of recipes are not exhaustive for CoreML; refer to CoreMLRecipeType for more detailed recipes. 
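With the import now gated on `sys.platform`, Core ML recipes are simply unavailable on Windows while the XNNPACK path keeps working. A hedged usage sketch follows; the exact signature of `get_ios_recipe` is assumed from its use in the tests below, and the key string comes from `IOS_CONFIGS`.

```python
# Sketch: resolve an iOS target recipe by key, guarding for platforms where
# coremltools cannot be imported. get_ios_recipe's signature is assumed here.
import sys

from executorch.export.target_recipes import get_ios_recipe

if sys.platform != "win32":
    recipe = get_ios_recipe("ios-arm64-coreml-fp16")
else:
    raise RuntimeError("Core ML recipes are not available on Windows")
```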
-IOS_CONFIGS: Dict[str, List[RecipeType]] = { - # pyre-ignore - "ios-arm64-coreml-fp32": [CoreMLRecipeType.FP32, XNNPackRecipeType.FP32], - # pyre-ignore - "ios-arm64-coreml-fp16": [CoreMLRecipeType.FP16], - # pyre-ignore - "ios-arm64-coreml-int8": [CoreMLRecipeType.PT2E_INT8_STATIC], -} +IOS_CONFIGS: Dict[str, List[RecipeType]] = ( + { + # pyre-ignore + "ios-arm64-coreml-fp32": [CoreMLRecipeType.FP32, XNNPackRecipeType.FP32], + # pyre-ignore + "ios-arm64-coreml-fp16": [CoreMLRecipeType.FP16], + # pyre-ignore + "ios-arm64-coreml-int8": [CoreMLRecipeType.PT2E_INT8_STATIC], + } + if sys.platform != "win32" + else {} +) def _create_target_recipe( diff --git a/export/tests/test_target_recipes.py b/export/tests/test_target_recipes.py index d781ffea945..7a2a7c87342 100644 --- a/export/tests/test_target_recipes.py +++ b/export/tests/test_target_recipes.py @@ -7,10 +7,10 @@ # pyre-strict import logging +import sys import unittest import torch -from executorch.backends.apple.coreml.recipes import CoreMLRecipeProvider # pyre-ignore from executorch.backends.xnnpack.recipes.xnnpack_recipe_provider import ( XNNPACKRecipeProvider, ) @@ -18,6 +18,11 @@ from executorch.export.target_recipes import get_ios_recipe from executorch.runtime import Runtime +if sys.platform != "win32": + from executorch.backends.apple.coreml.recipes import ( # pyre-ignore + CoreMLRecipeProvider, + ) + class TestTargetRecipes(unittest.TestCase): """Test target recipes.""" @@ -26,12 +31,14 @@ def setUp(self) -> None: torch._dynamo.reset() super().setUp() recipe_registry.register_backend_recipe_provider(XNNPACKRecipeProvider()) - # pyre-ignore - recipe_registry.register_backend_recipe_provider(CoreMLRecipeProvider()) + if sys.platform != "win32": + # pyre-ignore + recipe_registry.register_backend_recipe_provider(CoreMLRecipeProvider()) def tearDown(self) -> None: super().tearDown() + @unittest.skipIf(sys.platform == "win32", "Core ML is not available on Windows.") def test_ios_fp32_recipe_with_xnnpack_fallback(self) -> None: # Linear ops skipped by coreml but handled by xnnpack class Model(torch.nn.Module): @@ -107,6 +114,7 @@ def forward(self, x, y): et_output = session.run_method("forward", example_inputs[0]) logging.info(f"et output {et_output}") + @unittest.skipIf(sys.platform == "win32", "Core ML is not available on Windows.") def test_ios_quant_recipes(self) -> None: class Model(torch.nn.Module): def __init__(self): diff --git a/extension/evalue_util/test/print_evalue_test.cpp b/extension/evalue_util/test/print_evalue_test.cpp index b881e55d8a8..242cb0af224 100644 --- a/extension/evalue_util/test/print_evalue_test.cpp +++ b/extension/evalue_util/test/print_evalue_test.cpp @@ -267,7 +267,7 @@ TEST(PrintEvalueTest, UnelidedBoolLists) { // case; the other scalar types use the same underlying code, so they don't // need to test this again. 
{ - EValue value(ArrayRef(list.data(), 0ul)); + EValue value(ArrayRef(list.data(), static_cast(0ul))); expect_output(value, "(len=0)[]"); } { @@ -419,7 +419,7 @@ TEST(PrintEvalueTest, UnelidedDoubleLists) { std::array list = {-2.2, -1, 0, INFINITY, NAN, 3.3}; { - EValue value(ArrayRef(list.data(), 0ul)); + EValue value(ArrayRef(list.data(), static_cast(0ul))); expect_output(value, "(len=0)[]"); } { diff --git a/extension/flat_tensor/test/CMakeLists.txt b/extension/flat_tensor/test/CMakeLists.txt index c3296dc61f3..fd3d6792f90 100644 --- a/extension/flat_tensor/test/CMakeLists.txt +++ b/extension/flat_tensor/test/CMakeLists.txt @@ -23,7 +23,7 @@ add_custom_command( "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddMulProgram.ptd" COMMAND ${PYTHON_EXECUTABLE} -m test.models.export_program --modules "ModuleAddMul" - --external-constants --outdir "${CMAKE_CURRENT_BINARY_DIR}" 2> /dev/null + --external-constants --outdir "${CMAKE_CURRENT_BINARY_DIR}" WORKING_DIRECTORY ${EXECUTORCH_ROOT} ) diff --git a/extension/llm/runner/irunner.h b/extension/llm/runner/irunner.h index 5bd5ef9d04e..ef93f32319c 100644 --- a/extension/llm/runner/irunner.h +++ b/extension/llm/runner/irunner.h @@ -125,39 +125,18 @@ class ET_EXPERIMENTAL IRunner { std::function token_callback, std::function stats_callback) = 0; - /** - * Generate text based on the provided prompt and generation config, from a - * given position in KV cache. - * - * @param prompt The input prompt to generate from - * @param start_pos The starting position in KV cache of the input. Note: - * Depending on the actual implementation, a runner may manage the position - * internally, and this may not be respected. - * @param config Generation configuration parameters - * @param token_callback Callback function called for each generated token - * @param stats_callback Callback function for generation statistics - * @return Error::Ok if successful, an error otherwise - */ - virtual runtime::Error generate_from_pos( - const std::string& prompt, - int64_t start_pos, - const GenerationConfig& config, - std::function token_callback, - std::function stats_callback) = 0; /** * Stop the generation process. */ virtual void stop() = 0; + /** * Force remove prefilled tokens and reset KV cache start position * - * For some existing runners, overriding this method is not needed because - * start_pos is passed as an argument to generate_from_pos. - * * This method removes the prefilled tokens from the KV cache and resets the * start position to 0. */ - virtual void reset() {}; + virtual void reset() = 0; }; } // namespace llm diff --git a/extension/llm/runner/multimodal_decoder_runner.h b/extension/llm/runner/multimodal_decoder_runner.h index f76b8c64028..c8db3e57000 100644 --- a/extension/llm/runner/multimodal_decoder_runner.h +++ b/extension/llm/runner/multimodal_decoder_runner.h @@ -48,7 +48,7 @@ class ET_EXPERIMENTAL MultimodalDecoderRunner &start_pos, {1}, executorch::aten::ScalarType::Long); // run text model auto outputs_res = ET_UNWRAP( - module_->execute(kTextModelMethod, {start_pos_tensor, embeddings})); + module_->execute(kTextModelMethod, {embeddings, start_pos_tensor})); ET_CHECK_MSG( outputs_res.size() == 1, diff --git a/extension/llm/runner/multimodal_prefiller.cpp b/extension/llm/runner/multimodal_prefiller.cpp index 1d9a0c8fdfc..2705a9eadff 100644 --- a/extension/llm/runner/multimodal_prefiller.cpp +++ b/extension/llm/runner/multimodal_prefiller.cpp @@ -91,24 +91,22 @@ Result MultimodalPrefiller::prefill( } // 2. Run decoder model for prefill. 
- // `cache_position` goes from start_pos to start_pos + encoder_output.size(1). - // e.g. if start_pos = 2 and encoder_output.size(1) = 5, - // cache_position_tensor should be [2, 3, 4, 5, 6]. + + // Get expected shape of cache position tensor, which should be the second + // argument + int64_t seq_len = encoder_output.toTensor().size(1); if (seq_len == 0) { ET_LOG(Error, "The encoder returned an empty output."); return ::executorch::runtime::Error::InvalidState; } - std::vector cache_positions(seq_len); - for (int64_t i = 0; i < seq_len; ++i) { - cache_positions[i] = start_pos + i; - } - auto cache_position_tensor = ::executorch::extension::from_blob( - cache_positions.data(), - {static_cast(seq_len)}, - executorch::aten::ScalarType::Long); + std::vector cache_positions; + + auto cache_position_tensor = ET_UNWRAP(populate_start_pos_or_cache_position( + module_, start_pos, cache_positions, seq_len, kTextModelMethod)); + auto prefill_result = module_->execute( - kTextModelMethod, {cache_position_tensor, encoder_output}); + kTextModelMethod, {encoder_output, cache_position_tensor}); if (prefill_result.error() != ::executorch::runtime::Error::Ok) { return prefill_result.error(); } diff --git a/extension/llm/runner/text_decoder_runner.cpp b/extension/llm/runner/text_decoder_runner.cpp index 27c00c19089..7cd7623f58f 100644 --- a/extension/llm/runner/text_decoder_runner.cpp +++ b/extension/llm/runner/text_decoder_runner.cpp @@ -36,37 +36,11 @@ ::executorch::runtime::Result TextDecoderRunner::step( // If only 1 input, we are not using kv cache bool use_kv_cache = method_meta.num_inputs() > 1; + std::vector cache_positions; + if (use_kv_cache) { - // Size of the second argument. This could be either input_pos or - // cache_positions - - // Check if we are using cache positions instead of input pos. - auto second_input_info = ET_UNWRAP(method_meta.input_tensor_meta(1)); - // For input_pos, numel is 1, for cache_positions, numel is max_seq_len - auto sizes = second_input_info.sizes(); - // Assuming 1D tensor - ET_CHECK_OR_RETURN_ERROR( - sizes.size() == 1, - InvalidProgram, - "The second input tensor is not 1D tensor. Got dimension (%zu)", - sizes.size()); - auto numel = sizes[0]; - std::vector<::executorch::aten::SizesType> sizes_vec = {numel}; - - TensorPtr start_pos_tensor; - if (numel > 1) { - // If we are here, model is exported with cache_positions, create a tensor - // with the same length as input_ids. 
Assuming the last dimension is the - // one with the variable token length, for example [1, S] or [1, 1, S] - sizes_vec[sizes_vec.size() - 1] = tokens->numel(); - start_pos_tensor = empty(sizes_vec, ::executorch::aten::ScalarType::Long); - torch::executor::native::arange_out_impl( - start_pos, start_pos + tokens->numel(), 1.0, *start_pos_tensor); - } else { - // Assuming model is exported with input_pos, create a tensor with size 1 - start_pos_tensor = from_blob( - &start_pos, sizes_vec, ::executorch::aten::ScalarType::Long); - } + auto start_pos_tensor = ET_UNWRAP(populate_start_pos_or_cache_position( + module_, start_pos, cache_positions, tokens->numel(), "forward")); std::vector inputs; auto inputs_res = io_manager_->prepare_decode(tokens, start_pos_tensor); diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp index b6f41fd7af6..333716ac831 100644 --- a/extension/llm/runner/text_llm_runner.cpp +++ b/extension/llm/runner/text_llm_runner.cpp @@ -43,7 +43,8 @@ TextLLMRunner::TextLLMRunner( io_manager_(std::move(io_manager)), text_token_generator_(std::move(text_token_generator)), stats_(std::move(stats)), - temperature_(temperature) { + temperature_(temperature), + pos_(0) { // Note: This constructor assumes that text_prefiller and text_token_generator // already have references to the Module and TextDecoderRunner they need } @@ -70,9 +71,8 @@ Error TextLLMRunner::load() { ET_LOG(Info, format, __VA_ARGS__); \ } -Error TextLLMRunner::generate_from_pos( +Error TextLLMRunner::generate( const std::string& prompt, - ET_UNUSED int64_t start_pos, const GenerationConfig& config, std::function token_callback, std::function stats_callback) { @@ -217,15 +217,6 @@ Error TextLLMRunner::generate_from_pos( return Error::Ok; } -Error TextLLMRunner::generate( - const std::string& prompt, - const GenerationConfig& config, - std::function token_callback, - std::function stats_callback) { - pos_ = 0; - return generate_from_pos(prompt, 0, config, token_callback, stats_callback); -} - Error TextLLMRunner::warmup(const std::string& prompt, int32_t max_new_tokens) { // Create a GenerationConfig for warmup GenerationConfig config{ diff --git a/extension/llm/runner/text_llm_runner.h b/extension/llm/runner/text_llm_runner.h index 21b77fe1dfa..9dd99d82d59 100644 --- a/extension/llm/runner/text_llm_runner.h +++ b/extension/llm/runner/text_llm_runner.h @@ -101,25 +101,6 @@ class ET_EXPERIMENTAL TextLLMRunner : public IRunner { std::function token_callback = {}, std::function stats_callback = {}) override; - /** - * Generate text based on the provided prompt and generation config, from a - * given position in KV cache. - * - * @param prompt The input prompt to generate from - * @param start_pos [Unused] The starting position in KV cache of the input, - * ignored because the runner manages the position internally. 
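With generate_from_pos() removed from IRunner and the KV-cache position now tracked inside the runner, a caller-side sketch of the new flow might look like the following. This is illustrative only: it assumes an already-loaded IRunner instance and the include path used elsewhere in the tree; run_two_turns and the prompt strings are hypothetical, not part of this patch.

#include <executorch/extension/llm/runner/irunner.h>  // assumed include path

#include <iostream>
#include <string>

using executorch::extension::llm::GenerationConfig;
using executorch::extension::llm::IRunner;
using executorch::runtime::Error;

// Hypothetical helper: drives two turns against any IRunner implementation.
Error run_two_turns(IRunner& runner) {
  GenerationConfig config;  // defaults; tune fields for real use
  // First turn: the runner advances its internal position as it prefills
  // and decodes, so no start_pos argument is passed.
  Error err = runner.generate(
      "Hello",
      config,
      [](const std::string& token) { std::cout << token; },
      /*stats_callback=*/{});
  if (err != Error::Ok) {
    return err;
  }
  // Second turn continues from the internally cached position.
  err = runner.generate(" Tell me more.", config, {}, {});
  // Starting a fresh conversation now goes through reset(), which every
  // IRunner must implement since the method became pure virtual.
  runner.reset();
  return err;
}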
- * @param config Generation configuration parameters - * @param token_callback Callback function called for each generated token - * @param stats_callback Callback function for generation statistics - * @return Error::Ok if successful, an error otherwise - */ - ET_DEPRECATED runtime::Error generate_from_pos( - const std::string& prompt, - ET_UNUSED int64_t start_pos, - const GenerationConfig& config, - std::function token_callback = {}, - std::function stats_callback = {}) override; - /** * @brief Warms up the model with a sample prompt * @@ -133,6 +114,7 @@ class ET_EXPERIMENTAL TextLLMRunner : public IRunner { ::executorch::runtime::Error warmup( const std::string& prompt, int32_t max_new_tokens); + /** * @brief Remove prefilled tokens and reset start position, and stats. * @@ -140,6 +122,7 @@ class ET_EXPERIMENTAL TextLLMRunner : public IRunner { * start position to 0. It also clears the stats for previous runs. */ void reset() override; + /** * @brief Stops the ongoing text generation process * diff --git a/extension/llm/runner/util.h b/extension/llm/runner/util.h index 0cb2463d163..5aff2c8a3b5 100644 --- a/extension/llm/runner/util.h +++ b/extension/llm/runner/util.h @@ -7,6 +7,9 @@ */ #pragma once +#include +#include +#include #include #include #include @@ -99,6 +102,48 @@ ET_EXPERIMENTAL size_t inline get_rss_bytes() { // when this changed. return 0; } + +// Returns the cache position tensor, which can be either a single start_pos +// (when the method_name [`text_decoder` or `forward`] expects a tensor with +// size 1 because model will populate the cache position tensor underneath), or +// a populated tensor for cache position, for the given start_pos and seq_len. +inline runtime::Result populate_start_pos_or_cache_position( + Module* module, + int64_t& start_pos, + std::vector& cache_positions_vec, + int seq_len, + const char* method_name = "forward") { + // Get expected shape of cache position tensor, which should be the second + // argument + auto method_meta = ET_UNWRAP(module->method_meta(method_name)); + auto second_input_info = ET_UNWRAP(method_meta.input_tensor_meta(1)); + auto second_input_sizes = second_input_info.sizes(); + auto numel = second_input_sizes[0]; + + for (int i = 0; i < second_input_sizes.size(); ++i) { + ET_LOG(Error, "second_input_sizes[%d] = %d", i, second_input_sizes[i]); + } + + TensorPtr start_pos_tensor; + if (numel > 1) { + // `cache_position` goes from start_pos to start_pos + + // encoder_output.size(1). e.g. if start_pos = 2 and encoder_output.size(1) + // = 5, cache_position_tensor should be [2, 3, 4, 5, 6]. + cache_positions_vec.resize(seq_len); + for (int64_t i = 0; i < seq_len; ++i) { + cache_positions_vec[i] = start_pos + i; + } + return ::executorch::extension::from_blob( + cache_positions_vec.data(), + {static_cast(seq_len)}, + executorch::aten::ScalarType::Long); + } else { + // Cache position is size 1. 
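As a plain illustration of the cache-position arithmetic the helper above implements (the comment's example: start_pos = 2 and seq_len = 5 give [2, 3, 4, 5, 6]), the standalone snippet below mirrors the loop without any ExecuTorch types; it is a sketch for exposition only.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  int64_t start_pos = 2;
  int64_t seq_len = 5;  // e.g. encoder_output.size(1)
  std::vector<int64_t> cache_positions(seq_len);
  for (int64_t i = 0; i < seq_len; ++i) {
    cache_positions[i] = start_pos + i;
  }
  // Prints: 2 3 4 5 6 -- the values the helper wraps in a Long tensor when
  // the method's second input expects a full cache_position (numel > 1).
  for (int64_t p : cache_positions) {
    std::cout << p << ' ';
  }
  std::cout << '\n';
}

The helper only takes this path when the second input has numel > 1; otherwise it wraps the scalar start_pos in a length-1 tensor, as the else branch just below shows.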
+ return ::executorch::extension::from_blob( + &start_pos, {1}, executorch::aten::ScalarType::Long); + } +} + } // namespace llm } // namespace extension } // namespace executorch diff --git a/extension/module/test/CMakeLists.txt b/extension/module/test/CMakeLists.txt index 964b810eed5..1c4358dd73e 100644 --- a/extension/module/test/CMakeLists.txt +++ b/extension/module/test/CMakeLists.txt @@ -24,10 +24,10 @@ add_custom_command( "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddMulProgram.pte" "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddMulProgram.ptd" COMMAND ${PYTHON_EXECUTABLE} -m test.models.export_program --modules - "ModuleAdd" --outdir "${CMAKE_CURRENT_BINARY_DIR}" 2> /dev/null + "ModuleAdd" --outdir "${CMAKE_CURRENT_BINARY_DIR}" COMMAND ${PYTHON_EXECUTABLE} -m test.models.export_program --modules "ModuleAddMul" - --external-constants --outdir "${CMAKE_CURRENT_BINARY_DIR}" 2> /dev/null + --external-constants --outdir "${CMAKE_CURRENT_BINARY_DIR}" WORKING_DIRECTORY ${EXECUTORCH_ROOT} ) diff --git a/extension/runner_util/test/CMakeLists.txt b/extension/runner_util/test/CMakeLists.txt index 0cca06178cd..44b85a7fced 100644 --- a/extension/runner_util/test/CMakeLists.txt +++ b/extension/runner_util/test/CMakeLists.txt @@ -20,7 +20,7 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) add_custom_command( OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/ModuleAdd.pte" COMMAND ${PYTHON_EXECUTABLE} -m test.models.export_program --modules - "ModuleAdd" --outdir "${CMAKE_CURRENT_BINARY_DIR}" 2> /dev/null + "ModuleAdd" --outdir "${CMAKE_CURRENT_BINARY_DIR}" WORKING_DIRECTORY ${EXECUTORCH_ROOT} ) diff --git a/extension/testing_util/temp_file.h b/extension/testing_util/temp_file.h index aa8f5bcc82e..4edaf2135d8 100644 --- a/extension/testing_util/temp_file.h +++ b/extension/testing_util/temp_file.h @@ -9,13 +9,11 @@ #pragma once #include +#include #include #include #include // open() -#include // tmpnam(), remove() -#include // write(), close() - #include namespace executorch { @@ -72,19 +70,13 @@ class TempFile { } // Write the contents to the file. - int fd = open( - path.c_str(), - // O_EXCL ensures that we are the ones creating this file, to help - // protect against race conditions. - O_CREAT | O_EXCL | O_RDWR, - // User can read and write, group can read. - S_IRUSR | S_IWUSR | S_IRGRP); - ASSERT_GE(fd, 0) << "open(" << path << ") failed: " << strerror(errno); - - ssize_t nwrite = write(fd, data, size); - ASSERT_EQ(nwrite, size) << "Failed to write " << size << " bytes (wrote " - << nwrite << "): " << strerror(errno); - close(fd); + std::ofstream file(path, std::ios::out | std::ios::binary); + ASSERT_TRUE(file.is_open()) + << "open(" << path << ") failed: " << strerror(errno); + + file.write((const char*)data, size); + ASSERT_TRUE(file.good()) + << "Failed to write " << size << " bytes: " << strerror(errno); *out_path = path; } diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt index 5bb647d3a09..a6c06e84293 100644 --- a/extension/threadpool/CMakeLists.txt +++ b/extension/threadpool/CMakeLists.txt @@ -20,16 +20,6 @@ if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() -# Threadpool size specifiers. Mutual exclusion is checking in default.cmake. -# Default to using performance cores if -# EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES isn't set. 
-set(_threadpool_size_flag) -if(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) - set(_threadpool_size_flag "EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES") -else() - set(_threadpool_size_flag "EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES") -endif() - add_library( extension_threadpool threadpool.cpp threadpool_guard.cpp thread_parallel.cpp cpuinfo_utils.cpp @@ -46,9 +36,7 @@ target_include_directories( $ $ ) -target_compile_definitions( - extension_threadpool PUBLIC ET_USE_THREADPOOL ${_threadpool_size_flag} -) +target_compile_definitions(extension_threadpool PUBLIC ET_USE_THREADPOOL) target_compile_options(extension_threadpool PUBLIC ${_common_compile_options}) # Install libraries diff --git a/extension/threadpool/targets.bzl b/extension/threadpool/targets.bzl index 1889cb650ad..6ef55c42434 100644 --- a/extension/threadpool/targets.bzl +++ b/extension/threadpool/targets.bzl @@ -22,7 +22,6 @@ def define_common_targets(): name = "threadpool_lib", srcs = _THREADPOOL_SRCS, deps = [ - ":cpuinfo_utils", "//executorch/runtime/core:core", "//executorch/runtime/core/portable_type/c10/c10:c10", ], diff --git a/extension/threadpool/test/threadpool_test.cpp b/extension/threadpool/test/threadpool_test.cpp index 052e6c22f5e..e7784d3cc11 100644 --- a/extension/threadpool/test/threadpool_test.cpp +++ b/extension/threadpool/test/threadpool_test.cpp @@ -7,7 +7,6 @@ */ #include -#include #include #include @@ -72,8 +71,6 @@ void run_lambda_with_size( } // namespace TEST(ThreadPoolTest, ParallelAdd) { - executorch::runtime::runtime_init(); - std::vector a, b, c, c_ref; size_t vector_size = 100; size_t grain_size = 10; @@ -114,8 +111,6 @@ TEST(ThreadPoolTest, ParallelAdd) { // Test parallel reduction where we acquire lock within lambda TEST(ThreadPoolTest, ParallelReduce) { - executorch::runtime::runtime_init(); - std::vector a; int32_t c = 0, c_ref = 0; size_t vector_size = 100; @@ -149,8 +144,6 @@ TEST(ThreadPoolTest, ParallelReduce) { // Copied from // caffe2/aten/src/ATen/test/test_thread_pool_guard.cp TEST(TestNoThreadPoolGuard, TestThreadPoolGuard) { - executorch::runtime::runtime_init(); - auto threadpool_ptr = ::executorch::extension::threadpool::get_pthreadpool(); ASSERT_NE(threadpool_ptr, nullptr); @@ -180,8 +173,6 @@ TEST(TestNoThreadPoolGuard, TestThreadPoolGuard) { } TEST(TestNoThreadPoolGuard, TestRunWithGuard) { - executorch::runtime::runtime_init(); - const std::vector array = {1, 2, 3}; auto pool = ::executorch::extension::threadpool::get_threadpool(); diff --git a/extension/threadpool/threadpool.cpp b/extension/threadpool/threadpool.cpp index 72265e4cf07..5fee732b053 100644 --- a/extension/threadpool/threadpool.cpp +++ b/extension/threadpool/threadpool.cpp @@ -6,7 +6,6 @@ * LICENSE file in the root directory of this source tree. */ -#include #include #include @@ -15,26 +14,9 @@ #include #include -#include #include -// At most one mode should be set. -#if ( \ - defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) && \ - defined(EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES)) -#error Multiple \ - threadpool size specifiers are set.At most one of \ - EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES, \ - and EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES may be defined. -#endif - -// Default to EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES if no mode is set. 
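With the size-specifier macros gone, the replacement code later in this hunk sizes the pool directly from cpuinfo's logical-processor count. A minimal standalone sketch of that query, assuming the cpuinfo library is linked; this is not ExecuTorch code:

#include <cpuinfo.h>
#include <cstdio>

int main() {
  // cpuinfo must be initialized before any query; this mirrors the check
  // kept in get_threadpool().
  if (!cpuinfo_initialize()) {
    std::fprintf(stderr, "cpuinfo initialization failed\n");
    return 1;
  }
  // Number of logical processors; the threadpool now uses this directly
  // instead of the performance-core count from cpuinfo_utils.
  std::printf("logical processors: %u\n", cpuinfo_get_processors_count());
  return 0;
}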
-#if !defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) && \ - !defined(EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES) -#define EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES 1 -#endif - namespace executorch::extension::threadpool { #if !(defined(WIN32)) @@ -114,25 +96,12 @@ void ThreadPool::run( // get_threadpool is not thread safe due to leak_corrupted_threadpool // Make this part threadsafe: TODO(kimishpatel) ThreadPool* get_threadpool() { - executorch::runtime::runtime_init(); - if (!cpuinfo_initialize()) { ET_LOG(Error, "cpuinfo initialization failed"); return nullptr; // NOLINT(facebook-hte-NullableReturn) } - // Choose the number of threads according to the EXECUTORCH_THREADPOOL_ - // options. See the description in threadpool.h. - -#if defined(EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES) - // Use threads=cores. - static int num_threads = cpuinfo_get_processors_count(); -#else - // Set threads equal to the number of performance cores. - static int num_threads = - ::executorch::extension::cpuinfo::get_num_performant_cores(); -#endif - + int num_threads = cpuinfo_get_processors_count(); /* * For llvm-tsan, holding limit for the number of locks for a single thread * is 63 (because of comparison < 64 instead of <=). pthreadpool's worst diff --git a/extension/threadpool/threadpool.h b/extension/threadpool/threadpool.h index 16acad6e5fa..3ad2d1d48d4 100644 --- a/extension/threadpool/threadpool.h +++ b/extension/threadpool/threadpool.h @@ -14,22 +14,6 @@ #include -/* - * Threadpool Options: - * - * Threadpool size has a sizble affect on performance. By default, the - * threadpool will be sized according to the number of performance cores. This - * behavior can be overriden with the following build-time options. Note that - * these options are mutually exclusive. - * - * - EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES (flag) - Sizes the threadpool - * equal to the number of performance cores on the system. This is the default - * behavior. - * - EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES (flag) - Sizes the threadpool - * equal to the number of logical cores on system. This is the historical - * behavior. - */ - namespace executorch::extension::threadpool { class ThreadPool final { diff --git a/kernels/portable/cpu/op_argmax.cpp b/kernels/portable/cpu/op_argmax.cpp index e9a561366f7..a48c152133b 100644 --- a/kernels/portable/cpu/op_argmax.cpp +++ b/kernels/portable/cpu/op_argmax.cpp @@ -49,13 +49,13 @@ Tensor& argmax_out( static constexpr const char op_name[] = "argmax.out"; ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] { - long* out_data = out.mutable_data_ptr(); + int64_t* out_data = out.mutable_data_ptr(); const bool success = parallel_for_each_reduce_over_dim_output_index( in, dim, out, [&](const auto begin, const auto end) { for (const auto out_ix : c10::irange(begin, end)) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + std::tuple acc = reduce_over_dim( + [](CTYPE v, int64_t ix, CTYPE acc_val, int64_t acc_ix) { // the below condition as written is equivalent to // !isnan(accval) && (isnan(v) || v > acc_val). See // argument in op_argmin.cpp. 
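Context for the long to int64_t substitutions in this and the following portable kernels: index tensors use ScalarType::Long, which is 64-bit, while long itself is only 32 bits on LLP64 targets such as 64-bit Windows with MSVC. A small standalone check (not part of the patch) makes the width difference visible:

#include <cstdint>
#include <iostream>

int main() {
  // On LP64 platforms (Linux, macOS) both are typically 8 bytes, so using
  // long for index data happened to work; on LLP64 (Windows/MSVC)
  // sizeof(long) is 4 and no longer matches the 64-bit index elements.
  std::cout << "sizeof(long)    = " << sizeof(long) << '\n';
  std::cout << "sizeof(int64_t) = " << sizeof(int64_t) << '\n';
  static_assert(sizeof(int64_t) == 8, "int64_t is always 64-bit");
}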
@@ -63,7 +63,7 @@ Tensor& argmax_out( acc_val = v; acc_ix = ix; } - return std::tuple{acc_val, acc_ix}; + return std::tuple{acc_val, acc_ix}; }, in, dim, diff --git a/kernels/portable/cpu/op_argmin.cpp b/kernels/portable/cpu/op_argmin.cpp index fda9463c5ee..55f2f82b04b 100644 --- a/kernels/portable/cpu/op_argmin.cpp +++ b/kernels/portable/cpu/op_argmin.cpp @@ -49,13 +49,13 @@ Tensor& argmin_out( static constexpr const char op_name[] = "argmin.out"; ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] { - long* out_data = out.mutable_data_ptr(); + int64_t* out_data = out.mutable_data_ptr(); const bool success = parallel_for_each_reduce_over_dim_output_index( in, dim, out, [&](const auto begin, const auto end) { for (const auto out_ix : c10::irange(begin, end)) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + std::tuple acc = reduce_over_dim( + [](CTYPE v, int64_t ix, CTYPE acc_val, int64_t acc_ix) { // the below condition as written is equivalent to // !isnan(accval) && (isnan(v) || v < acc_val). cases: // - if neither acc_val nor v is NaN, !(v >= acc_val) is @@ -70,7 +70,7 @@ Tensor& argmin_out( acc_val = v; acc_ix = ix; } - return std::tuple{acc_val, acc_ix}; + return std::tuple{acc_val, acc_ix}; }, in, dim, diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp index b3aa41cda85..8ac78fd5477 100644 --- a/kernels/portable/cpu/op_clamp.cpp +++ b/kernels/portable/cpu/op_clamp.cpp @@ -45,9 +45,9 @@ ET_NODISCARD bool check_bounds( static constexpr const char op_name[] = "clamp.out"; if (isIntegralType(out_type, /*includeBool=*/false)) { - const long val_long = utils::scalar_to(val_scalar); + const int64_t val_long = utils::scalar_to(val_scalar); ET_SWITCH_INT_TYPES(out_type, ctx, op_name, CTYPE_OUT, [&]() { - if (is_out_of_bounds(val_long)) { + if (is_out_of_bounds(val_long)) { ET_LOG(Error, "%s value out of bounds", val_name); is_valid = false; } diff --git a/kernels/portable/cpu/op_gather.cpp b/kernels/portable/cpu/op_gather.cpp index 9899c21a94e..02ea502ca63 100644 --- a/kernels/portable/cpu/op_gather.cpp +++ b/kernels/portable/cpu/op_gather.cpp @@ -30,7 +30,7 @@ void gather_helper( Tensor& out, int64_t dim) { const CTYPE* in_data = in.const_data_ptr(); - const long* index_data = index.const_data_ptr(); + const int64_t* index_data = index.const_data_ptr(); CTYPE* out_data = out.mutable_data_ptr(); if (index.dim() == 0) { diff --git a/kernels/portable/cpu/op_max.cpp b/kernels/portable/cpu/op_max.cpp index 7df93470d39..467c8ccffd5 100644 --- a/kernels/portable/cpu/op_max.cpp +++ b/kernels/portable/cpu/op_max.cpp @@ -82,19 +82,19 @@ std::tuple max_out( ET_SWITCH_REALHBBF16_TYPES( in.scalar_type(), ctx, "max.dim_max", CTYPE, [&]() { CTYPE* max_data = max.mutable_data_ptr(); - long* max_indices_data = max_indices.mutable_data_ptr(); + int64_t* max_indices_data = max_indices.mutable_data_ptr(); const bool success = parallel_for_each_reduce_over_dim_output_index( in, dim, max, [&](const auto begin, const auto end) { for (const auto out_ix : c10::irange(begin, end)) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + std::tuple acc = reduce_over_dim( + [](CTYPE v, int64_t ix, CTYPE acc_val, int64_t acc_ix) { if (!utils::isnan_override(acc_val) && (utils::isnan_override(v) || v > acc_val)) { acc_val = v; acc_ix = ix; } - return std::tuple{acc_val, acc_ix}; + return std::tuple{acc_val, acc_ix}; }, in, dim, diff --git a/kernels/portable/cpu/op_min.cpp 
b/kernels/portable/cpu/op_min.cpp index a4cd1be2067..304321bb9f8 100644 --- a/kernels/portable/cpu/op_min.cpp +++ b/kernels/portable/cpu/op_min.cpp @@ -82,19 +82,19 @@ std::tuple min_out( ET_SWITCH_REALHBBF16_TYPES( in.scalar_type(), ctx, "min.dim_min", CTYPE, [&]() { CTYPE* min_data = min.mutable_data_ptr(); - long* min_indices_data = min_indices.mutable_data_ptr(); + int64_t* min_indices_data = min_indices.mutable_data_ptr(); const bool success = parallel_for_each_reduce_over_dim_output_index( in, dim, min, [&](const auto begin, const auto end) { for (const auto out_ix : c10::irange(begin, end)) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + std::tuple acc = reduce_over_dim( + [](CTYPE v, int64_t ix, CTYPE acc_val, int64_t acc_ix) { if (!utils::isnan_override(acc_val) && (utils::isnan_override(v) || v < acc_val)) { acc_val = v; acc_ix = ix; } - return std::tuple{acc_val, acc_ix}; + return std::tuple{acc_val, acc_ix}; }, in, dim, diff --git a/kernels/portable/cpu/op_scatter.cpp b/kernels/portable/cpu/op_scatter.cpp index 58341cefb1e..42d40c8284d 100644 --- a/kernels/portable/cpu/op_scatter.cpp +++ b/kernels/portable/cpu/op_scatter.cpp @@ -32,7 +32,7 @@ void scatter_src_helper( const Tensor& src, Tensor& out) { const CTYPE* in_data = in.const_data_ptr(); - const long* index_data = index.const_data_ptr(); + const int64_t* index_data = index.const_data_ptr(); const CTYPE* src_data = src.const_data_ptr(); CTYPE* out_data = out.mutable_data_ptr(); @@ -72,7 +72,7 @@ void scatter_value_helper( CTYPE_VAL val, Tensor& out) { const CTYPE* in_data = in.const_data_ptr(); - const long* index_data = index.const_data_ptr(); + const int64_t* index_data = index.const_data_ptr(); CTYPE* out_data = out.mutable_data_ptr(); memcpy(out_data, in_data, in.nbytes()); diff --git a/kernels/portable/cpu/op_scatter_add.cpp b/kernels/portable/cpu/op_scatter_add.cpp index f9c1f7677b6..690c31342a9 100644 --- a/kernels/portable/cpu/op_scatter_add.cpp +++ b/kernels/portable/cpu/op_scatter_add.cpp @@ -23,7 +23,7 @@ namespace { template void scatter_add_helper( const CTYPE* src_data, - const long* index_data, + const int64_t* index_data, CTYPE* out_data, const Tensor& src, const Tensor& index, @@ -81,7 +81,7 @@ Tensor& scatter_add_out( ET_SWITCH_REALHBBF16_TYPES(self_type, ctx, "scatter_add.out", CTYPE, [&]() { const CTYPE* self_data = self.const_data_ptr(); - const long* index_data = index.const_data_ptr(); + const int64_t* index_data = index.const_data_ptr(); const CTYPE* src_data = src.const_data_ptr(); CTYPE* out_data = out.mutable_data_ptr(); diff --git a/kernels/portable/cpu/op_topk.cpp b/kernels/portable/cpu/op_topk.cpp index e2143ce78d5..bdea02f83bc 100644 --- a/kernels/portable/cpu/op_topk.cpp +++ b/kernels/portable/cpu/op_topk.cpp @@ -79,7 +79,7 @@ void perform_topk( elem_t* queue) { const CTYPE* const in_data = in.const_data_ptr(); CTYPE* values_data = values.mutable_data_ptr(); - long* indices_data = indices.mutable_data_ptr(); + int64_t* indices_data = indices.mutable_data_ptr(); if (in.dim() == 0) { values_data[0] = in_data[0]; diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt index 0304d751455..2e488b109c1 100644 --- a/kernels/test/CMakeLists.txt +++ b/kernels/test/CMakeLists.txt @@ -26,8 +26,8 @@ foreach(kernel ${_kernels}) set(_functions_include "#include ") add_custom_command( OUTPUT "${_wrapper_path}" - COMMAND mkdir -p ${_wrapper_dir} - COMMAND echo ${_functions_include} > "${_wrapper_path}" + COMMAND ${CMAKE_COMMAND} -E make_directory 
${_wrapper_dir} + COMMAND ${CMAKE_COMMAND} -E echo ${_functions_include} > "${_wrapper_path}" DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/Functions.h" "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/NativeFunctions.h" @@ -44,7 +44,7 @@ foreach(kernel ${_kernels}) add_custom_command( OUTPUT "${_wrapper_dir}/supported_features.cpp" "${_wrapper_dir}/supported_features.h" - COMMAND mkdir -p ${_wrapper_dir} + COMMAND ${CMAKE_COMMAND} -E make_directory ${_wrapper_dir} COMMAND ${PYTHON_EXECUTABLE} kernels/test/gen_supported_features.py kernels/${_supported_features_kernel}/test/supported_features_def.yaml > @@ -73,17 +73,35 @@ foreach(kernel ${_kernels}) "${CMAKE_CURRENT_BINARY_DIR}/../../kernels/${kernel}/${kernel}_ops_lib" ) endif() + + # Copy with glob needs to be handle in a platform-specific manner. + if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") + # The quoting here is complicated, because there are three levels of + # interpretation: CMake -> Batch -> Powershell. The invoked (batch) command + # should look like `powershell -Command "Copy-Item ... -Path \"...\" ...". + # Powershell sees `Copy-Item -Path "..." ...`. + set(_copy_headers_cmd + powershell + -Command + "Copy-Item -Path \\\"${_kernel_ops_lib_path}/*.h\\\" -Destination \\\"${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/\\\"" + ) + else() + set(_copy_headers_cmd + cp + "${_kernel_ops_lib_path}/*.h" + "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/" + ) + endif() + add_custom_command( OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/Functions.h" "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/NativeFunctions.h" "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/RegisterKernels.h" COMMAND - mkdir -p - "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/" - COMMAND - cp "${_kernel_ops_lib_path}/*.h" + ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/" + COMMAND ${_copy_headers_cmd} DEPENDS ${_kernel_ops_lib} ) endforeach() diff --git a/kernels/test/export_test_model.ps1 b/kernels/test/export_test_model.ps1 new file mode 100644 index 00000000000..d19e2a713d9 --- /dev/null +++ b/kernels/test/export_test_model.ps1 @@ -0,0 +1,24 @@ +param ( + [string]$Modules, + [string]$OutDir, + [string]$CondaEnv +) + +Set-PSDebug -Trace 1 + +# Activate the VS dev environment - needed for dynamo. Try to use vswhere to locate the install. If not, +# fall back to a reasonable guess for the build tools, which also happens to match the CLI setup. 
+$vswherePath = "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" +if (Test-Path $vswherePath) { + $vsInstallPath = & $vswherePath -latest -property installationPath +} else { + $vsInstallPath = "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\" +} + +& "$vsInstallPath\Common7\Tools\Launch-VsDevShell.ps1" -Arch amd64 -SkipAutomaticLocation + +conda activate $CondaEnv + +$Modules = $Modules.Replace(" ", ",") +echo "Modules: $Modules" +python -m test.models.export_program --modules "$Modules" --outdir "$OutDir" diff --git a/pytest-windows.ini b/pytest-windows.ini index 0eb30e3583d..0959318afdd 100644 --- a/pytest-windows.ini +++ b/pytest-windows.ini @@ -100,6 +100,7 @@ addopts = #extension/llm/export --deselect=extension/pybindings/test/test_pybindings.py::PybindingsTest::test_method_quantized_ops --deselect=extension/pybindings/test/test_pybindings.py::PybindingsTest::test_quantized_ops + --deselect=extension/pybindings/test/test_pybindings.py::PybindingsTest::test_program_data_separation --deselect=runtime/test/test_runtime.py::RuntimeTest::test_load_program_with_path --deselect=exir/backend/test/test_compatibility.py::TestCompatibility::test_compatibility_in_runtime --deselect=exir/backend/test/test_compatibility.py::TestCompatibility::test_compatibility_in_runtime_edge_program_manager @@ -108,6 +109,7 @@ addopts = --deselect=extension/llm/custom_ops/test_sdpa_with_kv_cache.py::SDPATestForSpeculativeDecode::test_sdpa_with_cache_seq_len_130 --deselect=devtools/inspector/tests/inspector_test.py::TestInspector::test_etrecord_populates_correct_edge_dialect_aot_intermediate_outputs --deselect=devtools/inspector/tests/inspector_test.py::TestInspector::test_etrecord_populates_correct_export_program_aot_intermediate_outputs + --deselect=runtime/test/test_runtime_etdump_gen.py::RuntimeETDumpGenTest::test_etdump_generation # run the same tests multiple times to determine their # flakiness status. Default to 50 re-runs diff --git a/runtime/core/exec_aten/testing_util/tensor_util.cpp b/runtime/core/exec_aten/testing_util/tensor_util.cpp index 35ddbe8ac15..218a64cf9dd 100644 --- a/runtime/core/exec_aten/testing_util/tensor_util.cpp +++ b/runtime/core/exec_aten/testing_util/tensor_util.cpp @@ -28,6 +28,43 @@ namespace testing { namespace { +/** + * Returns true if the two elements are close according to the description on + * `tensors_are_close()`. + * + * T must be a floating point type. Non-floating point data should be compared + * directly. + */ +template +bool element_is_close(const T a, const T b, double rtol, double atol) { + if constexpr (c10::is_reduced_floating_point_v) { + // MSVC complains about ambiguous overloads, so explicitly cast to float to + // compare. + return element_is_close( + static_cast(a), static_cast(b), rtol, atol); + } else { + if (std::isnan(a) && std::isnan(b)) { + // NaN == NaN + } else if (!std::isfinite(a) && !std::isfinite(b) && ((a > 0) == (b > 0))) { + // -Inf == -Inf + // +Inf == +Inf + } else if (rtol == 0 && atol == 0) { + // Exact comparison; avoid unnecessary math. + if (a != b) { + return false; + } + } else { + auto allowed_error = atol + std::abs(rtol * b); + auto actual_error = std::abs(a - b); + if (!std::isfinite(actual_error) || actual_error > allowed_error) { + return false; + } + } + + return true; + } +} + /** * Returns true if the two arrays are close according to the description on * `tensors_are_close()`. 
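For exposition, the closeness rule the new element_is_close helper applies (NaN equals NaN, matching infinities are equal, otherwise |a - b| <= atol + |rtol * b|) can be sketched in isolation. This is a simplified double-only version, not the library code:

#include <cmath>
#include <cstdio>

// Same shape as the helper above, but limited to double for brevity.
bool is_close(double a, double b, double rtol, double atol) {
  if (std::isnan(a) && std::isnan(b)) {
    return true;  // NaN compares equal to NaN here, unlike operator==.
  }
  if (!std::isfinite(a) && !std::isfinite(b) && ((a > 0) == (b > 0))) {
    return true;  // +Inf == +Inf, -Inf == -Inf.
  }
  if (rtol == 0 && atol == 0) {
    return a == b;  // Exact comparison requested; skip the math.
  }
  const double allowed_error = atol + std::abs(rtol * b);
  const double actual_error = std::abs(a - b);
  return std::isfinite(actual_error) && actual_error <= allowed_error;
}

int main() {
  std::printf("%d\n", is_close(1.0001, 1.0, 1e-3, 0.0));  // 1: within rtol
  std::printf("%d\n", is_close(NAN, NAN, 0.0, 0.0));      // 1: NaN == NaN
  std::printf("%d\n", is_close(1.0, 2.0, 1e-3, 0.0));     // 0: too far apart
}

In the real helper, reduced-precision element types (Half/BFloat16) are first cast to float, sidestepping the MSVC ambiguous-overload issue noted in its comment.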
@@ -55,23 +92,8 @@ bool data_is_close( const auto ai = a[i]; const auto bi = b[i]; - if (std::isnan(ai) && std::isnan(bi)) { - // NaN == NaN - } else if ( - !std::isfinite(ai) && !std::isfinite(bi) && ((ai > 0) == (bi > 0))) { - // -Inf == -Inf - // +Inf == +Inf - } else if (rtol == 0 && atol == 0) { - // Exact comparison; avoid unnecessary math. - if (ai != bi) { - return false; - } - } else { - auto allowed_error = atol + std::abs(rtol * bi); - auto actual_error = std::abs(ai - bi); - if (!std::isfinite(actual_error) || actual_error > allowed_error) { - return false; - } + if (!element_is_close(ai, bi, rtol, atol)) { + return false; } } return true; diff --git a/runtime/executor/test/CMakeLists.txt b/runtime/executor/test/CMakeLists.txt index d8df1f9ea56..05d149ab1b4 100644 --- a/runtime/executor/test/CMakeLists.txt +++ b/runtime/executor/test/CMakeLists.txt @@ -17,6 +17,31 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) +if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") + # Use a wrapper script to set up the environment for MSVC to make Dynamo + # export work. + set(_export_program_cmd + powershell + ${EXECUTORCH_ROOT}/kernels/test/export_test_model.ps1 + -Modules + "\"ModuleAdd,ModuleAddHalf,ModuleAddMul,ModuleDynamicCatUnallocatedIO,ModuleIndex,ModuleMultipleEntry,ModuleSimpleTrain,ModuleStateful\"" + -outDir + "${CMAKE_CURRENT_BINARY_DIR}" + -CondaEnv + $ENV{CONDA_DEFAULT_ENV} + ) +else() + set(_export_program_cmd + ${PYTHON_EXECUTABLE} + -m + test.models.export_program + --modules + "ModuleAdd,ModuleAddHalf,ModuleAddMul,ModuleDynamicCatUnallocatedIO,ModuleIndex,ModuleMultipleEntry,ModuleSimpleTrain,ModuleStateful" + --outdir + "${CMAKE_CURRENT_BINARY_DIR}" + ) +endif() + add_custom_command( OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/ModuleAdd.pte" "${CMAKE_CURRENT_BINARY_DIR}/ModuleAddHalf.pte" @@ -29,17 +54,14 @@ add_custom_command( "${CMAKE_CURRENT_BINARY_DIR}/ModuleSimpleTrain.pte" "${CMAKE_CURRENT_BINARY_DIR}/ModuleStateful.pte" "${CMAKE_CURRENT_BINARY_DIR}/delegated/ModuleAddMul.pte" - COMMAND - ${PYTHON_EXECUTABLE} -m test.models.export_program --modules - "ModuleAdd,ModuleAddHalf,ModuleAddMul,ModuleDynamicCatUnallocatedIO,ModuleIndex,ModuleMultipleEntry,ModuleSimpleTrain,ModuleStateful" - --outdir "${CMAKE_CURRENT_BINARY_DIR}" + COMMAND ${_export_program_cmd} COMMAND ${PYTHON_EXECUTABLE} -m test.models.export_program --modules "ModuleAddMul" --external-constants --outdir "${CMAKE_CURRENT_BINARY_DIR}" COMMAND ${PYTHON_EXECUTABLE} -m test.models.export_delegated_program --modules "ModuleAddMul" --backend_id "StubBackend" --outdir - "${CMAKE_CURRENT_BINARY_DIR}/delegated/" || true + "${CMAKE_CURRENT_BINARY_DIR}/delegated/" WORKING_DIRECTORY ${EXECUTORCH_ROOT} ) diff --git a/runtime/kernel/test/CMakeLists.txt b/runtime/kernel/test/CMakeLists.txt index c70ec5d135b..a8166017e53 100644 --- a/runtime/kernel/test/CMakeLists.txt +++ b/runtime/kernel/test/CMakeLists.txt @@ -39,12 +39,6 @@ add_test(kernel_runtime_context_test kernel_runtime_context_test) add_executable( operator_registry_max_kernel_num_test operator_registry_max_kernel_num_test.cpp - ../operator_registry.cpp - ../../core/evalue.cpp - ../../platform/abort.cpp - ../../platform/log.cpp - ../../platform/runtime.cpp - ../../platform/default/posix.cpp ) target_link_libraries( operator_registry_max_kernel_num_test GTest::gtest GTest::gtest_main diff --git a/test/end2end/exported_module.py b/test/end2end/exported_module.py index e5630b8e89f..750b9097335 100644 --- 
a/test/end2end/exported_module.py +++ b/test/end2end/exported_module.py @@ -187,7 +187,6 @@ def __init__(self, method): if method_name_to_dynamic_shapes else None ), - strict=True, ) exec_prog = to_edge( diff --git a/test/models/export_delegated_program.py b/test/models/export_delegated_program.py index 8f7c388d7ad..98f4b0b9b36 100644 --- a/test/models/export_delegated_program.py +++ b/test/models/export_delegated_program.py @@ -155,9 +155,9 @@ def forward(self, *args, **kwargs): if method_name != "forward": # Only require wrapper module if we're exporting a specific method other than forward. - exported_program = export(WrapperModule(eager_module), args=inputs, strict=True) + exported_program = export(WrapperModule(eager_module), args=inputs) else: - exported_program = export(eager_module, args=inputs, strict=True) + exported_program = export(eager_module, args=inputs) edge_config = EdgeCompileConfig(_check_ir_validity=False) et_config = exir.ExecutorchBackendConfig( @@ -178,7 +178,7 @@ def forward(self, *args, **kwargs): module=tagged_module, gen_tag_fn=lambda x: module_class.__name__, ) - exported_program = export(tagged_module, args=inputs, strict=True) + exported_program = export(tagged_module, args=inputs) executorch_program = to_edge_transform_and_lower( exported_program, compile_config=edge_config, @@ -205,7 +205,7 @@ def forward(self, *args, **kwargs): composite_module(*inputs) executorch_program = to_edge( - export(composite_module, args=inputs, strict=True) + export(composite_module, args=inputs) ).to_executorch(config=et_config) return executorch_program diff --git a/tools/cmake/Utils.cmake b/tools/cmake/Utils.cmake index 1e0671eb920..77918ebbf2e 100644 --- a/tools/cmake/Utils.cmake +++ b/tools/cmake/Utils.cmake @@ -62,6 +62,8 @@ endfunction() function(target_link_options_gc_sections target_name) if(APPLE) target_link_options(${target_name} PRIVATE "LINKER:-dead_strip") + elseif(WIN32) + target_link_options(${target_name} PRIVATE "LINKER:/OPT:REF") else() target_link_options(${target_name} PRIVATE "LINKER:--gc-sections") endif() diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake index 16f4245f6bc..fb0dc0a4ade 100644 --- a/tools/cmake/preset/default.cmake +++ b/tools/cmake/preset/default.cmake @@ -176,36 +176,6 @@ define_overridable_option( ${_default_executorch_build_cpuinfo} ) -# Threadpool size options. At most one can be specified. Note that the default -# is managed in threadpool.cpp to allow the user to specify an alternate mode -# without needing to explicitly set the default to off. -define_overridable_option( - EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES - "Set the number of threads used for CPU parallel computation equal to the number of performant CPU cores." - BOOL - OFF -) -define_overridable_option( - EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES - "Set the number of threads used for CPU parallel computation equal to the number of logical CPU cores." 
- BOOL - OFF -) - -check_required_options_on( - IF_ON EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES REQUIRES - EXECUTORCH_BUILD_PTHREADPOOL EXECUTORCH_BUILD_CPUINFO -) -check_required_options_on( - IF_ON EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES REQUIRES - EXECUTORCH_BUILD_PTHREADPOOL EXECUTORCH_BUILD_CPUINFO -) - -check_conflicting_options_on( - IF_ON EXECUTORCH_THREADPOOL_USE_PERFORMANCE_CORES CONFLICTS_WITH - EXECUTORCH_THREADPOOL_USE_ALL_LOGICAL_CORES -) - # TODO(jathu): move this to platform specific presets when created set(_default_executorch_build_executor_runner ON) if(APPLE AND "${SDK_NAME}" STREQUAL "iphoneos")