diff --git a/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/_index.md b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/_index.md new file mode 100644 index 0000000000..336051e2d2 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/_index.md @@ -0,0 +1,48 @@ +--- +title: Learn how to build and test KleidiCV on macOS + +minutes_to_complete: 30 + +who_is_this_for: This is an introductory topic for software developers to learn how + to build and test KleidiCV on macOS. + +learning_objectives: +- Install and compile KleidiCV on macOS +- Run KleidiCV example tests +- Enable SME build and verify SME backend + +prerequisites: +- A Mac computer with an Apple M4 processor running macOS. + +author: Jett Zhou + +### Tags +skilllevels: Introductory +subjects: SME +armips: + - ARMv9-A +tools_software_languages: + - kleidiCV and C/C++ +operatingsystems: + - Mac OS + + + +further_reading: + - resource: + title: KleidiCV documentation + link: https://gitlab.arm.com/kleidi/kleidicv/-/tree/0.6.0/doc?ref_type=tags + type: documentation + - resource: + title: Announcing Arm KleidiCV 0.1 + link: https://developer.arm.com/community/arm-community-blogs/b/ai-blog/posts/kleidicv + type: blog + + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. 
+--- diff --git a/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/_next-steps.md b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- diff --git a/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/build-1.md b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/build-1.md new file mode 100644 index 0000000000..c3f50d320a --- /dev/null +++ b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/build-1.md @@ -0,0 +1,175 @@ +--- +title: Download and Build the KleidiCV Software +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Introduction + +Arm KleidiCV is an open-source library of optimized performance-critical routines for Arm CPUs. It is designed for integrating into any CV framework to enable best performance for CV workloads on Arm, with no action needed by application developers. + +Each KleidiCV function has three different implementations targeting Neon, SVE2 (Scalable Vector Extension) or Streaming SVE & SME2 (Scalable Matrix Extension). KleidiCV will automatically detect what hardware it's running on and select the best implementation accordingly. KleidiCV can be used as a lightweight standalone image processing library. Alternatively, KleidiCV can be used seamlessly as part of the extremely popular OpenCV library. 
+ +Since the Apple M4 family is based on the ARMv9.2‑A architecture, it supports the Scalable Matrix Extension (SME) (or a variant thereof) for matrix-compute acceleration. This Learning Path demonstrates how to build and test KleidiCV, and explains how the backend implementation is selected for the KleidiCV functions. + +## Host Environment + +The host machine utilized is a MacBook Pro (Apple M4 Pro), and the operating system version is detailed below: + +```bash +ProductName: macOS +ProductVersion: 15.5 +BuildVersion: 24F74 +``` + +CMake is available for installation through Homebrew if it is not already installed on the host machine. + +```bash +brew install cmake +``` + +The host architecture feature can be verified as outlined below, confirming that FEAT_SME is supported: + +```bash +sysctl -a | grep hw.optional.arm.FEAT +hw.optional.arm.FEAT_CRC32: 1 +hw.optional.arm.FEAT_FlagM: 1 +hw.optional.arm.FEAT_FlagM2: 1 +hw.optional.arm.FEAT_FHM: 1 +hw.optional.arm.FEAT_DotProd: 1 +hw.optional.arm.FEAT_SHA3: 1 +hw.optional.arm.FEAT_RDM: 1 +hw.optional.arm.FEAT_LSE: 1 +hw.optional.arm.FEAT_SHA256: 1 +hw.optional.arm.FEAT_SHA512: 1 +hw.optional.arm.FEAT_SHA1: 1 +hw.optional.arm.FEAT_AES: 1 +hw.optional.arm.FEAT_PMULL: 1 +hw.optional.arm.FEAT_SPECRES: 0 +hw.optional.arm.FEAT_SPECRES2: 0 +hw.optional.arm.FEAT_SB: 1 +hw.optional.arm.FEAT_FRINTTS: 1 +hw.optional.arm.FEAT_PACIMP: 1 +hw.optional.arm.FEAT_LRCPC: 1 +hw.optional.arm.FEAT_LRCPC2: 1 +hw.optional.arm.FEAT_FCMA: 1 +hw.optional.arm.FEAT_JSCVT: 1 +hw.optional.arm.FEAT_PAuth: 1 +hw.optional.arm.FEAT_PAuth2: 1 +hw.optional.arm.FEAT_FPAC: 1 +hw.optional.arm.FEAT_FPACCOMBINE: 1 +hw.optional.arm.FEAT_DPB: 1 +hw.optional.arm.FEAT_DPB2: 1 +hw.optional.arm.FEAT_BF16: 1 +hw.optional.arm.FEAT_EBF16: 0 +hw.optional.arm.FEAT_I8MM: 1 +hw.optional.arm.FEAT_WFxT: 1 +hw.optional.arm.FEAT_RPRES: 1 +hw.optional.arm.FEAT_CSSC: 0 +hw.optional.arm.FEAT_HBC: 0 +hw.optional.arm.FEAT_ECV: 1 +hw.optional.arm.FEAT_AFP: 1 +hw.optional.arm.FEAT_LSE2: 
1 +hw.optional.arm.FEAT_CSV2: 1 +hw.optional.arm.FEAT_CSV3: 1 +hw.optional.arm.FEAT_DIT: 1 +hw.optional.arm.FEAT_FP16: 1 +hw.optional.arm.FEAT_SSBS: 0 +hw.optional.arm.FEAT_BTI: 1 +hw.optional.arm.FEAT_SME: 1 +hw.optional.arm.FEAT_SME2: 1 +hw.optional.arm.FEAT_SME_F64F64: 1 +hw.optional.arm.FEAT_SME_I16I64: 1 +``` + +## Download the Software + +To set up KleidiCV and OpenCV, first download the source code. In your $WORKSPACE directory, clone KleidiCV from GitLab using the 0.6.0 release tag. + +```bash +cd $WORKSPACE +git clone -b 0.6.0 https://git.gitlab.arm.com/kleidi/kleidicv.git +``` + +Clone the OpenCV repository from GitHub into $WORKSPACE using the 4.12.0 release tag, then change into the opencv directory. + +```bash +cd $WORKSPACE +git clone -b 4.12.0 https://github.com/opencv/opencv.git +cd opencv +``` + +From the opencv directory, apply the KleidiCV patches for OpenCV version 4.12. + +```bash +patch -p1 < ../kleidicv/adapters/opencv/opencv-4.12.patch +patch -p1 < ../kleidicv/adapters/opencv/extra_benchmarks/opencv-4.12.patch +``` + + +## Build Options + +* KLEIDICV_ENABLE_SVE2 - Enable Scalable Vector Extension 2 code paths. This is on by default for some popular compilers known to support SVE2 but otherwise off by default. + - KLEIDICV_LIMIT_SVE2_TO_SELECTED_ALGORITHMS - Limit Scalable Vector Extension 2 code paths to cases where it is expected to provide a benefit over other code paths. On by default. Has no effect if KLEIDICV_ENABLE_SVE2 is off. +* KLEIDICV_BENCHMARK - Enable building KleidiCV benchmarks. The benchmarks use Google Benchmark which will be downloaded automatically. Off by default. +* KLEIDICV_ENABLE_SME2 - Enable Scalable Matrix Extension 2 and Streaming Scalable Vector Extension code paths. Off by default while the ACLE SME specification is in beta. + - KLEIDICV_LIMIT_SME2_TO_SELECTED_ALGORITHMS - Limit Scalable Matrix Extension 2 code paths to cases where it is expected to provide a benefit over other code paths. On by default. Has no effect if KLEIDICV_ENABLE_SME2 is off. 
+ +{{% notice Note %}} +Normally, if our tests show SVE2 or SME2 are slower than NEON, we default to NEON (unless overridden with -DKLEIDICV_LIMIT_SVE2_TO_SELECTED_ALGORITHMS=OFF or -DKLEIDICV_LIMIT_SME2_TO_SELECTED_ALGORITHMS=OFF). +{{% /notice %}} + +## Build the KleidiCV standalone + +Use the following command to build KleidiCV natively: + +```bash +cmake -S $WORKSPACE/kleidicv \ + -B build-kleidicv-benchmark-SME \ + -DKLEIDICV_ENABLE_SME2=ON \ + -DKLEIDICV_LIMIT_SME2_TO_SELECTED_ALGORITHMS=OFF \ + -DKLEIDICV_BENCHMARK=ON \ + -DCMAKE_BUILD_TYPE=Release + +cmake --build build-kleidicv-benchmark-SME --parallel +``` +Once the build completes, the KleidiCV API and framework test binaries are available at the following locations: + +```bash +./build-kleidicv-benchmark-SME/test/framework/kleidicv-framework-test +./build-kleidicv-benchmark-SME/test/api/kleidicv-api-test +``` + +The KleidiCV benchmark binary is available at the following location: + +```bash +./build-kleidicv-benchmark-SME/benchmark/kleidicv-benchmark +``` +## Build the OpenCV with KleidiCV + +The following command can be used to build OpenCV with KleidiCV: + +```bash +cmake -S $WORKSPACE/opencv \ + -B build-opencv-kleidicv-sme \ + -DWITH_KLEIDICV=ON \ + -DKLEIDICV_ENABLE_SME2=ON \ + -DKLEIDICV_SOURCE_PATH=$WORKSPACE/kleidicv \ + -DBUILD_LIST=imgproc,core,ts \ + -DBUILD_SHARED_LIBS=OFF \ + -DBUILD_TESTS=ON \ + -DBUILD_PERF_TESTS=ON \ + -DWITH_PNG=OFF + +cmake --build build-opencv-kleidicv-sme --parallel --target opencv_perf_imgproc opencv_perf_core +``` + +Upon completion of the build process, the OpenCV test binaries will be available at the following locations: + +```bash +build-opencv-kleidicv-sme/bin/opencv_perf_core +build-opencv-kleidicv-sme/bin/opencv_perf_imgproc +``` + diff --git a/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/run-test-2.md b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/run-test-2.md new file mode 100644 index 0000000000..b3aa19c24b --- /dev/null +++ 
b/content/learning-paths/laptops-and-desktops/kleidicv-on-mac/run-test-2.md @@ -0,0 +1,517 @@ +--- +title: Test the Kleidicv and verify SME backend support +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Run the Test + +Once the build steps are complete, run the KleidiCV and OpenCV tests. + +* Run the KleidiCV test + +The KleidiCV API test verifies the public C++ API. You can run it as shown below, though the full test log is not included: + +```output +./test/api/kleidicv-api-test +Vector length is set to 16 bytes. +Seed is set to 2542467924. +[==========] Running 3703 tests from 141 test suites. +[----------] Global test environment set-up. +[----------] 9 tests from SaturatingAddAbsWithThresholdTest/0, where TypeParam = short +[ RUN ] SaturatingAddAbsWithThresholdTest/0.TestPositive +[ OK ] SaturatingAddAbsWithThresholdTest/0.TestPositive (0 ms) +[ RUN ] SaturatingAddAbsWithThresholdTest/0.TestNegative +[ OK ] SaturatingAddAbsWithThresholdTest/0.TestNegative (0 ms) +[ RUN ] SaturatingAddAbsWithThresholdTest/0.TestMin +[ OK ] SaturatingAddAbsWithThresholdTest/0.TestMin (0 ms) +[ RUN ] SaturatingAddAbsWithThresholdTest/0.TestZero +[ OK ] SaturatingAddAbsWithThresholdTest/0.TestZero (0 ms) +[ RUN ] SaturatingAddAbsWithThresholdTest/0.TestMax +[ OK ] SaturatingAddAbsWithThresholdTest/0.TestMax (0 ms) +[ RUN ] SaturatingAddAbsWithThresholdTest/0.NullPointer +[ OK ] SaturatingAddAbsWithThresholdTest/0.NullPointer (0 ms) +[ RUN ] SaturatingAddAbsWithThresholdTest/0.Misalignment +[ OK ] SaturatingAddAbsWithThresholdTest/0.Misalignment (0 ms) +[ RUN ] SaturatingAddAbsWithThresholdTest/0.ZeroImageSize +[ OK ] SaturatingAddAbsWithThresholdTest/0.ZeroImageSize (0 ms) +[ RUN ] SaturatingAddAbsWithThresholdTest/0.OversizeImage +[ OK ] SaturatingAddAbsWithThresholdTest/0.OversizeImage (0 ms) +[----------] 9 tests from SaturatingAddAbsWithThresholdTest/0 (0 ms total) + +[----------] 4 tests from BitwiseAnd/0, where TypeParam = unsigned char +[ RUN ] 
BitwiseAnd/0.API +[ OK ] BitwiseAnd/0.API (0 ms) +[ RUN ] BitwiseAnd/0.Misalignment +[ OK ] BitwiseAnd/0.Misalignment (0 ms) +[ RUN ] BitwiseAnd/0.ZeroImageSize +[ OK ] BitwiseAnd/0.ZeroImageSize (0 ms) +[ RUN ] BitwiseAnd/0.OversizeImage +[ OK ] BitwiseAnd/0.OversizeImage (0 ms) +[----------] 4 tests from BitwiseAnd/0 (0 ms total) +``` +{{% notice Note %}} +Currently, Apple Xcode is built on Clang 17, and clang-1700.3.19.1 has an SME-related code generation bug which causes the float ResizeLinear API tests to fail. +{{% /notice %}} + + +* Run the OpenCV test + +Upon completing the build steps for OpenCV with KleidiCV, the test binaries will be located in the "build-opencv-kleidicv-sme/bin/" directory. For example, `opencv_perf_imgproc` serves as OpenCV’s performance benchmark suite for the image processing (imgproc) module, evaluating both execution speed and throughput. + +Testing can be customized by selecting specific test filters and parameters using the "`--gtest_filter`" and "`--gtest_param_filter`" options, respectively. For instance, you can run the Gaussian blur 5×5 performance tests three times with the following parameter settings: +- Image size: 1920x1080 (Full HD) +- Image type: 8UC1 (8-bit unsigned, single channel, grayscale) +- Border type: BORDER_REPLICATE + +Additional test cases are available in [benchmarks.txt](https://gitlab.arm.com/kleidi/kleidicv/-/blob/0.6.0/scripts/benchmark/benchmarks.txt?ref_type=tags). 
+ +The command for running the test is as follows: + +```bash +./opencv_perf_imgproc \ + --gtest_filter='*gaussianBlur5x5/*' \ + --gtest_param_filter='(1920x1080, 8UC1, BORDER_REPLICATE)' \ + --gtest_repeat=3 +``` + +The output will appear as follows: + +```output +./opencv_perf_imgproc --gtest_filter='*gaussianBlur5x5/*' --gtest_param_filter='(1920x1080, 8UC1, BORDER_REPLICATE)' --gtest_repeat=3 +[ERROR:0@0.001] global persistence.cpp:566 open Can't open file: 'imgproc.xml' in read mode +TEST: Skip tests with tags: 'mem_6gb', 'verylong' +CTEST_FULL_OUTPUT +OpenCV version: 4.12.0 +OpenCV VCS version: 4.12.0-2-g2eea907534 +Build type: Release +Compiler: /usr/bin/c++ (ver 17.0.0.17000013) +Algorithm hint: ALGO_HINT_ACCURATE +HAL: YES (carotene (ver 0.0.1) KleidiCV (ver 0.6.0)) +Parallel framework: gcd (nthreads=12) +CPU features: NEON FP16 NEON_DOTPROD NEON_FP16 *NEON_BF16 +OpenCL Platforms: + Apple + iGPU: Apple M4 Pro (OpenCL 1.2 ) +Current OpenCL device: + Type = iGPU + Name = Apple M4 Pro + Version = OpenCL 1.2 + Driver version = 1.2 1.0 + Address bits = 64 + Compute units = 16 + Max work group size = 256 + Local memory size = 32 KB + Max memory allocation size = 3 GB + Double support = No + Half support = No + Host unified memory = Yes + Device extensions: + cl_APPLE_SetMemObjectDestructor + cl_APPLE_ContextLoggingFunctions + cl_APPLE_clut + cl_APPLE_query_kernel_names + cl_APPLE_gl_sharing + cl_khr_gl_event + cl_khr_byte_addressable_store + cl_khr_global_int32_base_atomics + cl_khr_global_int32_extended_atomics + cl_khr_local_int32_base_atomics + cl_khr_local_int32_extended_atomics + cl_khr_3d_image_writes + cl_khr_image2d_from_buffer + cl_khr_depth_images + Has AMD Blas = No + Has AMD Fft = No + Preferred vector width char = 1 + Preferred vector width short = 1 + Preferred vector width int = 1 + Preferred vector width long = 1 + Preferred vector width float = 1 + Preferred vector width double = 1 + Preferred vector width half = 0 + +Repeating all tests 
(iteration 1) . . . + +Note: Google Test filter = *gaussianBlur5x5/* +Note: Google Test parameter filter = (1920x1080, 8UC1, BORDER_REPLICATE) +[==========] Running 1 test from 1 test case. +[----------] Global test environment set-up. +[----------] 1 test from Size_MatType_BorderType_gaussianBlur5x5 +[ RUN ] Size_MatType_BorderType_gaussianBlur5x5.gaussianBlur5x5/80, where GetParam() = (1920x1080, 8UC1, BORDER_REPLICATE) +[ PERFSTAT ] (samples=100 mean=0.18 median=0.18 min=0.16 stddev=0.02 (12.7%)) +[ OK ] Size_MatType_BorderType_gaussianBlur5x5.gaussianBlur5x5/80 (22 ms) +[----------] 1 test from Size_MatType_BorderType_gaussianBlur5x5 (22 ms total) + +[----------] Global test environment tear-down +[==========] 1 test from 1 test case ran. (22 ms total) +[ PASSED ] 1 test. + +Repeating all tests (iteration 2) . . . + +Note: Google Test filter = *gaussianBlur5x5/* +Note: Google Test parameter filter = (1920x1080, 8UC1, BORDER_REPLICATE) +[==========] Running 1 test from 1 test case. +[----------] Global test environment set-up. +[----------] 1 test from Size_MatType_BorderType_gaussianBlur5x5 +[ RUN ] Size_MatType_BorderType_gaussianBlur5x5.gaussianBlur5x5/80, where GetParam() = (1920x1080, 8UC1, BORDER_REPLICATE) +[ PERFSTAT ] (samples=100 mean=0.18 median=0.17 min=0.16 stddev=0.04 (23.7%)) +[ OK ] Size_MatType_BorderType_gaussianBlur5x5.gaussianBlur5x5/80 (22 ms) +[----------] 1 test from Size_MatType_BorderType_gaussianBlur5x5 (22 ms total) + +[----------] Global test environment tear-down +[==========] 1 test from 1 test case ran. (22 ms total) +[ PASSED ] 1 test. + +Repeating all tests (iteration 3) . . . + +Note: Google Test filter = *gaussianBlur5x5/* +Note: Google Test parameter filter = (1920x1080, 8UC1, BORDER_REPLICATE) +[==========] Running 1 test from 1 test case. +[----------] Global test environment set-up. 
+[----------] 1 test from Size_MatType_BorderType_gaussianBlur5x5 +[ RUN ] Size_MatType_BorderType_gaussianBlur5x5.gaussianBlur5x5/80, where GetParam() = (1920x1080, 8UC1, BORDER_REPLICATE) +[ PERFSTAT ] (samples=100 mean=0.19 median=0.17 min=0.15 stddev=0.07 (36.1%)) +[ OK ] Size_MatType_BorderType_gaussianBlur5x5.gaussianBlur5x5/80 (23 ms) +[----------] 1 test from Size_MatType_BorderType_gaussianBlur5x5 (23 ms total) + +[----------] Global test environment tear-down +[==========] 1 test from 1 test case ran. (23 ms total) +[ PASSED ] 1 test. +``` + + + +## KleidiCV Multiversion Backend Support + +The KleidiCV library detects the platform hardware at runtime and selects the backend implementation based on the following priority: + +* SME2 backend implementation +* SME backend implementation +* SVE2 backend implementation +* NEON backend implementation + +```C { line_numbers = "true" } +#define KLEIDICV_MULTIVERSION_C_API(api_name, neon_impl, sve2_impl, sme_impl, \ + sme2_impl) \ + static decltype(neon_impl) api_name##_resolver() { \ + [[maybe_unused]] KLEIDICV_TARGET_NAMESPACE::HwCaps hwcaps = \ + KLEIDICV_TARGET_NAMESPACE::get_hwcaps(); \ + KLEIDICV_SME2_RESOLVE(sme2_impl); \ + KLEIDICV_SME_RESOLVE(sme_impl); \ + KLEIDICV_SVE2_RESOLVE(sve2_impl); \ + return neon_impl; \ + } \ + extern "C" { \ + decltype(neon_impl) api_name = api_name##_resolver(); \ + } +``` +On macOS, it verifies SME support by querying the sysctl key "hw.optional.arm.FEAT_SME" as follows: + +```C { line_numbers = "true" } +#define KLEIDICV_SME_RESOLVE(sme_impl) \ + if (!std::is_null_pointer_v<decltype(sme_impl)> && \ + KLEIDICV_TARGET_NAMESPACE::query_sysctl("hw.optional.arm.FEAT_SME")) { \ + return sme_impl; \ + } +``` +It verifies SME2 support by querying the sysctl key "hw.optional.arm.FEAT_SME2" as follows: + +```C { line_numbers = "true" } +#define KLEIDICV_SME2_RESOLVE(sme2_impl) \ + if (!std::is_null_pointer_v<decltype(sme2_impl)> && \ + KLEIDICV_TARGET_NAMESPACE::query_sysctl("hw.optional.arm.FEAT_SME2")) { \ + return sme2_impl; \ + } +``` + + +## Enable 
debug information for backend implementation at runtime + +To print which backend implementation each KleidiCV API resolves to at runtime, update "kleidicv/include/kleidicv/dispatch.h" as outlined below: + + +```C { line_numbers = "true" } +diff --git a/kleidicv/include/kleidicv/dispatch.h b/kleidicv/include/kleidicv/dispatch.h +index cc6ee01..44c98a5 100644 +--- a/kleidicv/include/kleidicv/dispatch.h ++++ b/kleidicv/include/kleidicv/dispatch.h +@@ -1,10 +1,11 @@ +-// SPDX-FileCopyrightText: 2023 - 2025 Arm Limited and/or its affiliates ++// SPDX-FileCopyrightText: 2024 - 2025 Arm Limited and/or its affiliates + // + // SPDX-License-Identifier: Apache-2.0 + + #ifndef KLEIDICV_DISPATCH_H + #define KLEIDICV_DISPATCH_H + ++#include <cstdio> + #include "kleidicv/config.h" + + #if KLEIDICV_ENABLE_SME2 || KLEIDICV_ENABLE_SME || KLEIDICV_ENABLE_SVE2 +@@ -33,6 +34,7 @@ static bool query_sysctl(const char* attribute_name) { + #define KLEIDICV_SVE2_RESOLVE(sve2_impl) \ + if (!std::is_null_pointer_v<decltype(sve2_impl)> && \ + KLEIDICV_TARGET_NAMESPACE::query_sysctl("hw.optional.arm.FEAT_SVE2")) { \ ++ printf("kleidicv API:: %s,SVE2 backend. \n", __func__); \ + return sve2_impl; \ + } + #else +@@ -43,6 +45,7 @@ static bool query_sysctl(const char* attribute_name) { + #define KLEIDICV_SME_RESOLVE(sme_impl) \ + if (!std::is_null_pointer_v<decltype(sme_impl)> && \ + KLEIDICV_TARGET_NAMESPACE::query_sysctl("hw.optional.arm.FEAT_SME")) { \ ++ printf("kleidicv API:: %s,SME backend. \n", __func__); \ + return sme_impl; \ + } + #else +@@ -53,6 +56,7 @@ static bool query_sysctl(const char* attribute_name) { + #define KLEIDICV_SME2_RESOLVE(sme2_impl) \ + if (!std::is_null_pointer_v<decltype(sme2_impl)> && \ + KLEIDICV_TARGET_NAMESPACE::query_sysctl("hw.optional.arm.FEAT_SME2")) { \ ++ printf("kleidicv API:: %s,SME2 backend. 
\n", __func__); \ + return sme2_impl; \ + } + #else +@@ -67,6 +71,7 @@ static bool query_sysctl(const char* attribute_name) { + KLEIDICV_SME2_RESOLVE(sme2_impl); \ + KLEIDICV_SME_RESOLVE(sme_impl); \ + KLEIDICV_SVE2_RESOLVE(sve2_impl); \ ++ printf("kleidicv API:: %s,NEON backend. \n", __func__); \ + return neon_impl; \ + } \ + extern "C" { \ +``` + +## Neon or SME backend data extraction on a MacBook + +After making the change and rebuilding for testing, you can display the SME backend usage summary as follows: + +```output +./kleidicv-benchmark +kleidicv API:: kleidicv_min_max_u8_resolver,SME backend. +kleidicv API:: kleidicv_min_max_s8_resolver,SME backend. +kleidicv API:: kleidicv_min_max_u16_resolver,SME backend. +kleidicv API:: kleidicv_min_max_s16_resolver,SME backend. +kleidicv API:: kleidicv_min_max_s32_resolver,SME backend. +kleidicv API:: kleidicv_min_max_f32_resolver,SME backend. +kleidicv API:: kleidicv_min_max_loc_u8_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_absdiff_u8_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_absdiff_s8_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_absdiff_u16_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_absdiff_s16_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_absdiff_s32_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_add_abs_with_threshold_s16_resolver,SME backend. +kleidicv API:: kleidicv_saturating_add_s8_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_add_u8_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_add_s16_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_add_u16_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_add_s32_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_add_u32_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_add_s64_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_add_u64_resolver,NEON backend. 
+kleidicv API:: kleidicv_compare_equal_u8_resolver,NEON backend. +kleidicv API:: kleidicv_compare_greater_u8_resolver,NEON backend. +kleidicv API:: kleidicv_exp_f32_resolver,SME backend. +kleidicv API:: kleidicv_in_range_u8_resolver,NEON backend. +kleidicv API:: kleidicv_in_range_f32_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_multiply_u8_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_multiply_s8_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_multiply_u16_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_multiply_s16_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_multiply_s32_resolver,NEON backend. +kleidicv API:: kleidicv_rotate_resolver,NEON backend. +kleidicv API:: kleidicv_scale_u8_resolver,NEON backend. +kleidicv API:: kleidicv_scale_f32_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_sub_s8_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_sub_u8_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_sub_s16_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_sub_u16_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_sub_s32_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_sub_u32_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_sub_s64_resolver,NEON backend. +kleidicv API:: kleidicv_saturating_sub_u64_resolver,NEON backend. +kleidicv API:: kleidicv_sum_f32_resolver,SME backend. +kleidicv API:: kleidicv_threshold_binary_u8_resolver,SME backend. +kleidicv API:: kleidicv_transpose_resolver,NEON backend. +kleidicv API:: kleidicv_f32_to_s8_resolver,SME backend. +kleidicv API:: kleidicv_f32_to_u8_resolver,SME backend. +kleidicv API:: kleidicv_s8_to_f32_resolver,SME backend. +kleidicv API:: kleidicv_u8_to_f32_resolver,SME backend. +kleidicv API:: kleidicv_gray_to_rgb_u8_resolver,SME backend. +kleidicv API:: kleidicv_gray_to_rgba_u8_resolver,SME backend. +kleidicv API:: kleidicv_merge_resolver,NEON backend. 
+kleidicv API:: kleidicv_rgb_to_bgr_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgba_to_bgra_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgb_to_bgra_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgb_to_rgba_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgba_to_bgr_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgba_to_rgb_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgb_to_yuv420_p_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgba_to_yuv420_p_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_bgr_to_yuv420_p_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_bgra_to_yuv420_p_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgb_to_yuv420_sp_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgba_to_yuv420_sp_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_bgr_to_yuv420_sp_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_bgra_to_yuv420_sp_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgb_to_yuv_u8_resolver,SME backend. +kleidicv API:: kleidicv_bgr_to_yuv_u8_resolver,SME backend. +kleidicv API:: kleidicv_rgba_to_yuv_u8_resolver,SME backend. +kleidicv API:: kleidicv_bgra_to_yuv_u8_resolver,SME backend. +kleidicv API:: kleidicv_split_resolver,NEON backend. +kleidicv API:: kleidicv_yuv_p_to_rgb_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_p_to_bgr_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_p_to_rgba_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_p_to_bgra_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_sp_to_rgb_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_sp_to_bgr_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_sp_to_rgba_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_sp_to_bgra_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_to_rgb_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_to_bgr_u8_resolver,SME backend. +kleidicv API:: kleidicv_yuv_to_bgra_u8_resolver,SME backend. 
+kleidicv API:: kleidicv_yuv_to_rgba_u8_resolver,SME backend. +kleidicv API:: kleidicv_blur_and_downsample_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_gaussian_blur_fixed_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_gaussian_blur_arbitrary_stripe_u8_resolver,NEON backend. +kleidicv API:: kleidicv_median_blur_sorting_network_stripe_s8_resolver,SME backend. +kleidicv API:: kleidicv_median_blur_sorting_network_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_median_blur_sorting_network_stripe_u16_resolver,SME backend. +kleidicv API:: kleidicv_median_blur_sorting_network_stripe_s16_resolver,SME backend. +kleidicv API:: kleidicv_median_blur_sorting_network_stripe_u32_resolver,SME backend. +kleidicv API:: kleidicv_median_blur_sorting_network_stripe_s32_resolver,SME backend. +kleidicv API:: kleidicv_median_blur_sorting_network_stripe_f32_resolver,SME backend. +kleidicv API:: kleidicv_median_blur_small_hist_stripe_u8_resolver,NEON backend. +kleidicv API:: kleidicv_median_blur_large_hist_stripe_u8_resolver,NEON backend. +kleidicv API:: kleidicv_scharr_interleaved_stripe_s16_u8_resolver,SME backend. +kleidicv API:: kleidicv_separable_filter_2d_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_separable_filter_2d_stripe_u16_resolver,SME backend. +kleidicv API:: kleidicv_separable_filter_2d_stripe_s16_resolver,SME backend. +kleidicv API:: kleidicv_sobel_3x3_horizontal_stripe_s16_u8_resolver,SME backend. +kleidicv API:: kleidicv_sobel_3x3_vertical_stripe_s16_u8_resolver,SME backend. +kleidicv API:: kleidicv_bitwise_and_resolver,NEON backend. +kleidicv API:: kleidicv_dilate_u8_resolver,SME backend. +kleidicv API:: kleidicv_erode_u8_resolver,SME backend. +kleidicv API:: kleidicv_resize_to_quarter_u8_resolver,SME backend. +kleidicv API:: kleidicv_resize_linear_stripe_u8_resolver,SME backend. +kleidicv API:: kleidicv_resize_linear_stripe_f32_resolver,SME backend. +kleidicv API:: kleidicv_remap_s16_u8_resolver,NEON backend. 
+kleidicv API:: kleidicv_remap_s16_u16_resolver,NEON backend. +kleidicv API:: kleidicv_remap_s16point5_u8_resolver,NEON backend. +kleidicv API:: kleidicv_remap_s16point5_u16_resolver,NEON backend. +kleidicv API:: kleidicv_remap_f32_u8_resolver,NEON backend. +kleidicv API:: kleidicv_remap_f32_u16_resolver,NEON backend. +kleidicv API:: kleidicv_warp_perspective_stripe_u8_resolver,NEON backend. +``` + +## Use lldb to check SME backend implementation + +To enable source-level debugging, change the build type from "Release" to "Debug" when configuring the build, as demonstrated in the following example, which uses a separate build directory for the debug build: + +```bash +cmake -S $WORKSPACE/kleidicv \ + -B build-kleidicv-benchmark-SME-debug \ + -DKLEIDICV_ENABLE_SME2=ON \ + -DKLEIDICV_LIMIT_SME2_TO_SELECTED_ALGORITHMS=OFF \ + -DKLEIDICV_BENCHMARK=ON \ + -DCMAKE_BUILD_TYPE=Debug + +cmake --build build-kleidicv-benchmark-SME-debug --parallel +``` + +Use the lldb debug tool to set breakpoints during API testing and verify if the SME backend implementation is invoked. To view the function call backtrace, run the "bt" command as shown below: + +```C + +lldb kleidicv-api-test +(lldb) target create "kleidicv-api-test" +Current executable set to '/Users/Shared/workspace/build-kleidicv-benchmark-SME-debug/test/api/kleidicv-api-test' (arm64). +(lldb) b saturating_add_abs_with_threshold +Breakpoint 1: 2 locations. +(lldb) run +Process 82381 launched: '/Users/Shared/workspace/build-kleidicv-benchmark-SME-debug/test/api/kleidicv-api-test' (arm64) +Vector length is set to 16 bytes. +Seed is set to 3168213869. +[==========] Running 3703 tests from 141 test suites. +[----------] Global test environment set-up. 
+[----------] 9 tests from SaturatingAddAbsWithThresholdTest/0, where TypeParam = short +[ RUN ] SaturatingAddAbsWithThresholdTest/0.TestPositive +Process 82381 stopped +* thread #1, queue = 'com.apple.main-thread', stop reason = breakpoint 1.2 + frame #0: 0x0000000100695554 kleidicv-api-test`kleidicv_error_t kleidicv::sme::saturating_add_abs_with_threshold(src_a=0x0000600002796762, src_a_stride=46, src_b=0x00006000027967f2, src_b_stride=46, dst=0x0000600002796912, dst_stride=46, width=23, height=3, threshold=50) at add_abs_with_threshold_sme.cpp:15:47 + 12 const T *src_b, size_t src_b_stride, T *dst, + 13 size_t dst_stride, size_t width, + 14 size_t height, T threshold) { +-> 15 return saturating_add_abs_with_threshold_sc(src_a, src_a_stride, src_b, + 16 src_b_stride, dst, dst_stride, + 17 width, height, threshold); + 18 } +(lldb) bt +* thread #1, queue = 'com.apple.main-thread', stop reason = breakpoint 1.2 + * frame #0: 0x0000000100695554 kleidicv-api-test`kleidicv_error_t kleidicv::sme::saturating_add_abs_with_threshold(src_a=0x0000600002796762, src_a_stride=46, src_b=0x00006000027967f2, src_b_stride=46, dst=0x0000600002796912, dst_stride=46, width=23, height=3, threshold=50) at add_abs_with_threshold_sme.cpp:15:47 + frame #1: 0x0000000100009930 kleidicv-api-test`SaturatingAddAbsWithThresholdTestBase::call_api(this=0x000000016fdfe670) at test_add_abs_with_threshold.cpp:17:12 + frame #2: 0x00000001000090c8 kleidicv-api-test`OperationTest::test(this=0x000000016fdfe670) at operation.h:90:11 + frame #3: 0x0000000100008870 kleidicv-api-test`SaturatingAddAbsWithThresholdTest_TestPositive_Test::TestBody(this=0x000060000179e270) at test_add_abs_with_threshold.cpp:135:58 + frame #4: 0x00000001008417cc kleidicv-api-test`void testing::internal::HandleSehExceptionsInMethodIfSupported(object=0x000060000179e270, method=0x00000000000000010000000000000020, location="the test body") at gtest.cc:2599:10 + frame #5: 0x0000000100810908 kleidicv-api-test`void 
testing::internal::HandleExceptionsInMethodIfSupported(object=0x000060000179e270, method=0x00000000000000010000000000000020, location="the test body") at gtest.cc:2635:14 + frame #6: 0x0000000100810858 kleidicv-api-test`testing::Test::Run(this=0x000060000179e270) at gtest.cc:2674:5 + frame #7: 0x000000010081163c kleidicv-api-test`testing::TestInfo::Run(this=0x000000011fe04290) at gtest.cc:2853:11 + frame #8: 0x00000001008126bc kleidicv-api-test`testing::TestSuite::Run(this=0x000000011fe049d0) at gtest.cc:3012:30 + frame #9: 0x000000010081fdec kleidicv-api-test`testing::internal::UnitTestImpl::RunAllTests(this=0x000000011fe04780) at gtest.cc:5870:44 + frame #10: 0x0000000100845750 kleidicv-api-test`bool testing::internal::HandleSehExceptionsInMethodIfSupported(object=0x000000011fe04780, method=(kleidicv-api-test`testing::internal::UnitTestImpl::RunAllTests() at gtest.cc:5748), location="auxiliary test code (environments or event listeners)") at gtest.cc:2599:10 + frame #11: 0x000000010081f804 kleidicv-api-test`bool testing::internal::HandleExceptionsInMethodIfSupported(object=0x000000011fe04780, method=(kleidicv-api-test`testing::internal::UnitTestImpl::RunAllTests() at gtest.cc:5748), location="auxiliary test code (environments or event listeners)") at gtest.cc:2635:14 + frame #12: 0x000000010081f6fc kleidicv-api-test`testing::UnitTest::Run(this=0x00000001009c92f0) at gtest.cc:5444:10 + frame #13: 0x00000001004e8600 kleidicv-api-test`RUN_ALL_TESTS() at gtest.h:2293:73 + frame #14: 0x00000001004e83a8 kleidicv-api-test`main(argc=1, argv=0x000000016fdff3b0) at test_main.cpp:82:10 + frame #15: 0x000000019f492b98 dyld`start + 6076 +``` + +In the meantime, the "disassemble --frame" command can be used to display the assembly instructions in SME streaming mode, as shown below: + + +```C +disassemble --frame +kleidicv-api-test`kleidicv::sme::saturating_add_abs_with_threshold: + 0x100695510 <+0>: sub sp, sp, #0xa0 + 0x100695514 <+4>: stp d15, d14, [sp, #0x50] + 0x100695518 
<+8>: stp d13, d12, [sp, #0x60] + 0x10069551c <+12>: stp d11, d10, [sp, #0x70] + 0x100695520 <+16>: stp d9, d8, [sp, #0x80] + 0x100695524 <+20>: stp x29, x30, [sp, #0x90] + 0x100695528 <+24>: smstart sm + 0x10069552c <+28>: ldrsh w8, [sp, #0xa0] + 0x100695530 <+32>: str x0, [sp, #0x48] + 0x100695534 <+36>: str x1, [sp, #0x40] + 0x100695538 <+40>: str x2, [sp, #0x38] + 0x10069553c <+44>: str x3, [sp, #0x30] + 0x100695540 <+48>: str x4, [sp, #0x28] + 0x100695544 <+52>: str x5, [sp, #0x20] + 0x100695548 <+56>: str x6, [sp, #0x18] + 0x10069554c <+60>: str x7, [sp, #0x10] + 0x100695550 <+64>: strh w8, [sp, #0xe] +-> 0x100695554 <+68>: ldr x0, [sp, #0x48] + 0x100695558 <+72>: ldr x1, [sp, #0x40] + 0x10069555c <+76>: ldr x2, [sp, #0x38] + 0x100695560 <+80>: ldr x3, [sp, #0x30] + 0x100695564 <+84>: ldr x4, [sp, #0x28] + 0x100695568 <+88>: ldr x5, [sp, #0x20] + 0x10069556c <+92>: ldr x6, [sp, #0x18] + 0x100695570 <+96>: ldr x7, [sp, #0x10] + 0x100695574 <+100>: ldrh w8, [sp, #0xe] + 0x100695578 <+104>: mov x9, sp + 0x10069557c <+108>: strh w8, [x9] + 0x100695580 <+112>: bl 0x10087b8d0 ; symbol stub for: kleidicv_error_t kleidicv::sme::saturating_add_abs_with_threshold_sc(short const*, unsigned long, short const*, unsigned long, short*, unsigned long, unsigned long, unsigned long, short) + 0x100695584 <+116>: smstop sm + 0x100695588 <+120>: ldp x29, x30, [sp, #0x90] + 0x10069558c <+124>: ldp d9, d8, [sp, #0x80] + 0x100695590 <+128>: ldp d11, d10, [sp, #0x70] + 0x100695594 <+132>: ldp d13, d12, [sp, #0x60] + 0x100695598 <+136>: ldp d15, d14, [sp, #0x50] + 0x10069559c <+140>: add sp, sp, #0xa0 + 0x1006955a0 <+144>: ret +(lldb) +``` +